[llvm] r319913 - [X86][AVX512] Tag aligned/unaligned move instruction scheduler classes

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 6 09:59:26 PST 2017


Author: rksimon
Date: Wed Dec  6 09:59:26 2017
New Revision: 319913

URL: http://llvm.org/viewvc/llvm-project?rev=319913&view=rev
Log:
[X86][AVX512] Tag aligned/unaligned move instruction scheduler classes
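
The patch adds a MoveLoadStoreItins helper class (rr/rm/mr itineraries) to
X86InstrSSE.td and threads SSE_MOVA/SSE_MOVU instances through the AVX512
avx512_load/avx512_store multiclasses, pairing each instruction form with an
explicit Sched<[WriteMove]>, Sched<[WriteLoad]> or Sched<[WriteStore]> class.
A minimal sketch of the pattern, reduced from the diff below (not a complete
definition):

  class MoveLoadStoreItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
                           InstrItinClass arg_mr> {
    InstrItinClass rr = arg_rr;  // reg-reg move
    InstrItinClass rm = arg_rm;  // reg <- mem (load)
    InstrItinClass mr = arg_mr;  // mem <- reg (store)
  }

  def SSE_MOVA : MoveLoadStoreItins<
    IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM, IIC_SSE_MOVA_P_MR
  >;

  // Each aligned/unaligned move form then selects the matching itinerary and
  // SchedRW class: register moves take itins.rr with Sched<[WriteMove]>, loads
  // take itins.rm with Sched<[WriteLoad]>, stores take itins.mr with
  // Sched<[WriteStore]>.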

Modified:
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/lib/Target/X86/X86InstrSSE.td
    llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=319913&r1=319912&r2=319913&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Wed Dec  6 09:59:26 2017
@@ -3148,42 +3148,40 @@ defm : mask_shift_lowering<VK2, v2i1>, R
 //
 
 
-multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
-                         PatFrag ld_frag, PatFrag mload,
-                         bit NoRMPattern = 0,
-                         SDPatternOperator SelectOprr = vselect> {
+multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
+                       X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload,
+                       bit NoRMPattern = 0,
+                       SDPatternOperator SelectOprr = vselect> {
   let hasSideEffects = 0 in {
   def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
-                    _.ExeDomain>, EVEX;
+                    _.ExeDomain, itins.rr>, EVEX, Sched<[WriteMove]>;
   def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
                       (ins _.KRCWM:$mask,  _.RC:$src),
                       !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
                        "${dst} {${mask}} {z}, $src}"),
                        [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
                                            (_.VT _.RC:$src),
-                                           _.ImmAllZerosV)))], _.ExeDomain>,
-                       EVEX, EVEX_KZ;
+                                           _.ImmAllZerosV)))], _.ExeDomain,
+                       itins.rr>, EVEX, EVEX_KZ, Sched<[WriteMove]>;
 
-  let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1,
-      SchedRW = [WriteLoad] in
+  let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1 in
   def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     !if(NoRMPattern, [],
                         [(set _.RC:$dst,
                           (_.VT (bitconvert (ld_frag addr:$src))))]),
-                    _.ExeDomain>, EVEX;
+                    _.ExeDomain, itins.rm>, EVEX, Sched<[WriteLoad]>;
 
   let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
-  def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
-                    (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
-                    !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
-                    "${dst} {${mask}}, $src1}"),
-                    [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
-                                        (_.VT _.RC:$src1),
-                                        (_.VT _.RC:$src0))))], _.ExeDomain>,
-                     EVEX, EVEX_K;
-    let SchedRW = [WriteLoad] in
+    def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
+                      (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
+                      !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
+                      "${dst} {${mask}}, $src1}"),
+                      [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
+                                          (_.VT _.RC:$src1),
+                                          (_.VT _.RC:$src0))))], _.ExeDomain,
+                       itins.rr>, EVEX, EVEX_K, Sched<[WriteMove]>;
     def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
                      (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
                      !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
@@ -3191,16 +3189,16 @@ multiclass avx512_load<bits<8> opc, stri
                      [(set _.RC:$dst, (_.VT
                          (vselect _.KRCWM:$mask,
                           (_.VT (bitconvert (ld_frag addr:$src1))),
-                           (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K;
+                           (_.VT _.RC:$src0))))], _.ExeDomain, itins.rm>,
+                     EVEX, EVEX_K, Sched<[WriteLoad]>;
   }
-  let SchedRW = [WriteLoad] in
   def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
                   (ins _.KRCWM:$mask, _.MemOp:$src),
                   OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
                                 "${dst} {${mask}} {z}, $src}",
                   [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
                     (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
-                  _.ExeDomain>, EVEX, EVEX_KZ;
+                  _.ExeDomain, itins.rm>, EVEX, EVEX_KZ, Sched<[WriteLoad]>;
   }
   def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
             (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
@@ -3217,14 +3215,17 @@ multiclass avx512_alignedload_vl<bits<8>
                                   AVX512VLVectorVTInfo _,
                                   Predicate prd> {
   let Predicates = [prd] in
-  defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag,
-                       masked_load_aligned512>, EVEX_V512;
+  defm Z : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info512,
+                       _.info512.AlignedLdFrag, masked_load_aligned512>,
+                       EVEX_V512;
 
   let Predicates = [prd, HasVLX] in {
-  defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.AlignedLdFrag,
-                          masked_load_aligned256>, EVEX_V256;
-  defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag,
-                          masked_load_aligned128>, EVEX_V128;
+  defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info256,
+                          _.info256.AlignedLdFrag, masked_load_aligned256>,
+                          EVEX_V256;
+  defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info128,
+                          _.info128.AlignedLdFrag, masked_load_aligned128>,
+                          EVEX_V128;
   }
 }
 
@@ -3234,38 +3235,40 @@ multiclass avx512_load_vl<bits<8> opc, s
                                   bit NoRMPattern = 0,
                                   SDPatternOperator SelectOprr = vselect> {
   let Predicates = [prd] in
-  defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag,
+  defm Z : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info512, _.info512.LdFrag,
                        masked_load_unaligned, NoRMPattern,
                        SelectOprr>, EVEX_V512;
 
   let Predicates = [prd, HasVLX] in {
-  defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag,
+  defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info256, _.info256.LdFrag,
                          masked_load_unaligned, NoRMPattern,
                          SelectOprr>, EVEX_V256;
-  defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag,
+  defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info128, _.info128.LdFrag,
                          masked_load_unaligned, NoRMPattern,
                          SelectOprr>, EVEX_V128;
   }
 }
 
-multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
-                        PatFrag st_frag, PatFrag mstore, string Name,
-                        bit NoMRPattern = 0> {
-
+multiclass avx512_store<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
+                        X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore,
+                        string Name, bit NoMRPattern = 0> {
   let hasSideEffects = 0 in {
   def rr_REV  : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
                          OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
-                         [], _.ExeDomain>, EVEX, FoldGenData<Name#rr>;
+                         [], _.ExeDomain, itins.rr>, EVEX, FoldGenData<Name#rr>,
+                         Sched<[WriteMove]>;
   def rrk_REV : AVX512PI<opc, MRMDestReg, (outs  _.RC:$dst),
                          (ins _.KRCWM:$mask, _.RC:$src),
                          OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"#
                          "${dst} {${mask}}, $src}",
-                         [], _.ExeDomain>,  EVEX, EVEX_K, FoldGenData<Name#rrk>;
+                         [], _.ExeDomain, itins.rr>,  EVEX, EVEX_K,
+                         FoldGenData<Name#rrk>, Sched<[WriteMove]>;
   def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs  _.RC:$dst),
                           (ins _.KRCWM:$mask, _.RC:$src),
                           OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
                           "${dst} {${mask}} {z}, $src}",
-                          [], _.ExeDomain>, EVEX, EVEX_KZ, FoldGenData<Name#rrkz>;
+                          [], _.ExeDomain, itins.rr>, EVEX, EVEX_KZ,
+                          FoldGenData<Name#rrkz>, Sched<[WriteMove]>;
   }
 
   let hasSideEffects = 0, mayStore = 1 in
@@ -3273,11 +3276,11 @@ multiclass avx512_store<bits<8> opc, str
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     !if(NoMRPattern, [],
                         [(st_frag (_.VT _.RC:$src), addr:$dst)]),
-                    _.ExeDomain>, EVEX;
+                    _.ExeDomain, itins.mr>, EVEX, Sched<[WriteStore]>;
   def mrk : AVX512PI<opc, MRMDestMem, (outs),
                      (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
               OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
-               [], _.ExeDomain>, EVEX, EVEX_K;
+               [], _.ExeDomain, itins.mr>, EVEX, EVEX_K, Sched<[WriteStore]>;
 
   def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),
            (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr,
@@ -3289,14 +3292,14 @@ multiclass avx512_store_vl< bits<8> opc,
                             AVX512VLVectorVTInfo _, Predicate prd,
                             string Name, bit NoMRPattern = 0> {
   let Predicates = [prd] in
-  defm Z : avx512_store<opc, OpcodeStr, _.info512, store,
+  defm Z : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info512, store,
                         masked_store_unaligned, Name#Z, NoMRPattern>, EVEX_V512;
 
   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store,
+    defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info256, store,
                              masked_store_unaligned, Name#Z256,
                              NoMRPattern>, EVEX_V256;
-    defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store,
+    defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info128, store,
                              masked_store_unaligned, Name#Z128,
                              NoMRPattern>, EVEX_V128;
   }
@@ -3306,13 +3309,13 @@ multiclass avx512_alignedstore_vl<bits<8
                                   AVX512VLVectorVTInfo _,  Predicate prd,
                                   string Name> {
   let Predicates = [prd] in
-  defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore,
+  defm Z : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info512, alignedstore,
                         masked_store_aligned512, Name#Z>, EVEX_V512;
 
   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore,
+    defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info256, alignedstore,
                              masked_store_aligned256, Name#Z256>, EVEX_V256;
-    defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore,
+    defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info128, alignedstore,
                              masked_store_aligned128, Name#Z128>, EVEX_V128;
   }
 }
@@ -3381,24 +3384,24 @@ defm VMOVDQU64 : avx512_load_vl<0x6F, "v
 let isReMaterializable = 1, canFoldAsLoad = 1,
     isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in {
 def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
-                            "", []>;
+                            "", [], IIC_SSE_MOVA_P_RM>;
 def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
-                            "", []>;
+                            "", [], IIC_SSE_MOVA_P_RM>;
 def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
-                            "", []>;
+                            "", [], IIC_SSE_MOVA_P_RM>;
 def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
-                            "", []>;
+                            "", [], IIC_SSE_MOVA_P_RM>;
 }
 
-let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+let isPseudo = 1, SchedRW = [WriteStore], mayStore = 1, hasSideEffects = 0 in {
 def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
-                            "", []>;
+                            "", [], IIC_SSE_MOVA_P_MR>;
 def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
-                            "", []>;
+                            "", [], IIC_SSE_MOVA_P_MR>;
 def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
-                            "", []>;
+                            "", [], IIC_SSE_MOVA_P_MR>;
 def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
-                            "", []>;
+                            "", [], IIC_SSE_MOVA_P_MR>;
 }
 
 def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
@@ -3907,8 +3910,8 @@ let hasSideEffects = 0 in {
   def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
                            (ins VR128X:$src1, VR128X:$src2),
                            "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                           [], NoItinerary>, XS, EVEX_4V, VEX_LIG,
-                           FoldGenData<"VMOVSSZrr">;
+                           [], IIC_SSE_MOV_S_RR>, XS, EVEX_4V, VEX_LIG,
+                           FoldGenData<"VMOVSSZrr">, Sched<[WriteMove]>;
 
 let Constraints = "$src0 = $dst" in
   def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
@@ -3916,21 +3919,21 @@ let Constraints = "$src0 = $dst" in
                                                    VR128X:$src1, VR128X:$src2),
                              "vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
                                         "$dst {${mask}}, $src1, $src2}",
-                             [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG,
-                             FoldGenData<"VMOVSSZrrk">;
+                             [], IIC_SSE_MOV_S_RR>, EVEX_K, XS, EVEX_4V, VEX_LIG,
+                             FoldGenData<"VMOVSSZrrk">, Sched<[WriteMove]>;
 
   def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
                          (ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2),
                          "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"#
                                     "$dst {${mask}} {z}, $src1, $src2}",
-                         [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
-                         FoldGenData<"VMOVSSZrrkz">;
+                         [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
+                         FoldGenData<"VMOVSSZrrkz">, Sched<[WriteMove]>;
 
   def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
                            (ins VR128X:$src1, VR128X:$src2),
                            "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                           [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W,
-                           FoldGenData<"VMOVSDZrr">;
+                           [], IIC_SSE_MOV_S_RR>, XD, EVEX_4V, VEX_LIG, VEX_W,
+                           FoldGenData<"VMOVSDZrr">, Sched<[WriteMove]>;
 
 let Constraints = "$src0 = $dst" in
   def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
@@ -3938,16 +3941,16 @@ let Constraints = "$src0 = $dst" in
                                                    VR128X:$src1, VR128X:$src2),
                              "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"#
                                         "$dst {${mask}}, $src1, $src2}",
-                             [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG,
-                             VEX_W, FoldGenData<"VMOVSDZrrk">;
+                             [], IIC_SSE_MOV_S_RR>, EVEX_K, XD, EVEX_4V, VEX_LIG,
+                             VEX_W, FoldGenData<"VMOVSDZrrk">, Sched<[WriteMove]>;
 
   def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
                               (ins f64x_info.KRCWM:$mask, VR128X:$src1,
                                                           VR128X:$src2),
                               "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
                                          "$dst {${mask}} {z}, $src1, $src2}",
-                              [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
-                              VEX_W, FoldGenData<"VMOVSDZrrkz">;
+                              [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
+                              VEX_W, FoldGenData<"VMOVSDZrrkz">, Sched<[WriteMove]>;
 }
 
 let Predicates = [HasAVX512] in {

Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=319913&r1=319912&r2=319913&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Wed Dec  6 09:59:26 2017
@@ -25,9 +25,15 @@ class SizeItins<OpndItins arg_s, OpndIti
   OpndItins d = arg_d;
 }
 
+class MoveLoadStoreItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
+                         InstrItinClass arg_mr> {
+  InstrItinClass rr = arg_rr;
+  InstrItinClass rm = arg_rm;
+  InstrItinClass mr = arg_mr;
+}
 
 class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
-  InstrItinClass arg_ri> {
+                     InstrItinClass arg_ri> {
   InstrItinClass rr = arg_rr;
   InstrItinClass rm = arg_rm;
   InstrItinClass ri = arg_ri;
@@ -152,10 +158,18 @@ def SSE_MOVA_ITINS : OpndItins<
   IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
 >;
 
+def SSE_MOVA : MoveLoadStoreItins<
+  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM, IIC_SSE_MOVA_P_MR
+>;
+
 def SSE_MOVU_ITINS : OpndItins<
   IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
 >;
 
+def SSE_MOVU : MoveLoadStoreItins<
+  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM, IIC_SSE_MOVU_P_MR
+>;
+
 def SSE_DPPD_ITINS : OpndItins<
   IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
 >;

Modified: llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-schedule.ll?rev=319913&r1=319912&r2=319913&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-schedule.ll Wed Dec  6 09:59:26 2017
@@ -986,7 +986,7 @@ define <8 x double> @test_mask_broadcast
 ; GENERIC-NEXT:    vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_broadcast_vaddpd:
@@ -994,7 +994,7 @@ define <8 x double> @test_mask_broadcast
 ; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [11:0.50]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
   %tmp = load double, double* %j
@@ -2082,7 +2082,7 @@ define <16 x double> @uito16f64(<16 x i3
 ; GENERIC-NEXT:    vcvtudq2pd %ymm0, %zmm2 # sched: [4:1.00]
 ; GENERIC-NEXT:    vextractf64x4 $1, %zmm0, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcvtudq2pd %ymm0, %zmm1 # sched: [4:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: uito16f64:
@@ -2796,7 +2796,7 @@ define <16 x double> @sito16f64(<16 x i3
 ; GENERIC-NEXT:    vcvtdq2pd %ymm0, %zmm2 # sched: [4:1.00]
 ; GENERIC-NEXT:    vextractf64x4 $1, %zmm0, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcvtdq2pd %ymm0, %zmm1 # sched: [4:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sito16f64:
@@ -4528,7 +4528,7 @@ define void @extload_v8i64(<8 x i8>* %a,
 ; GENERIC-LABEL: extload_v8i64:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpmovsxbq (%rdi), %zmm0 # sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; GENERIC-NEXT:    vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -4549,18 +4549,18 @@ define <64 x i16> @test21(<64 x i16> %x
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %zmm2, %zmm2 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpmovb2m %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    kshiftrq $32, %k1, %k1
-; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm1 {%k1} {z}
+; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test21:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpsllw $7, %zmm2, %zmm2 # sched: [1:0.50]
 ; SKX-NEXT:    vpmovb2m %zmm2, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25]
 ; SKX-NEXT:    kshiftrq $32, %k1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm1, %zmm1 {%k1} {z}
+; SKX-NEXT:    vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer
   ret <64 x i16> %ret
@@ -5033,7 +5033,7 @@ define <16 x float> @test_x86_fmadd231_p
 ; GENERIC-NEXT:    vpmovb2m %xmm2, %k1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vmulps (%rdi), %zmm0, %zmm0 # sched: [9:1.00]
 ; GENERIC-NEXT:    vaddps %zmm1, %zmm0, %zmm1 {%k1} # sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_x86_fmadd231_ps:
@@ -5059,7 +5059,7 @@ define <16 x float> @test_x86_fmadd213_p
 ; GENERIC-NEXT:    vpmovb2m %xmm2, %k1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vmulps %zmm0, %zmm1, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    vaddps (%rdi), %zmm0, %zmm1 {%k1} # sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_x86_fmadd213_ps:
@@ -6175,7 +6175,7 @@ define <16 x i32> @mov_test17(i8 * %addr
 define void @mov_test18(i8 * %addr, <8 x i64> %data) {
 ; GENERIC-LABEL: mov_test18:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps %zmm0, (%rdi)
+; GENERIC-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6192,7 +6192,7 @@ define void @mov_test18(i8 * %addr, <8 x
 define void @mov_test19(i8 * %addr, <16 x i32> %data) {
 ; GENERIC-LABEL: mov_test19:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovups %zmm0, (%rdi)
+; GENERIC-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6209,7 +6209,7 @@ define void @mov_test19(i8 * %addr, <16
 define void @mov_test20(i8 * %addr, <16 x i32> %data) {
 ; GENERIC-LABEL: mov_test20:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps %zmm0, (%rdi)
+; GENERIC-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6241,7 +6241,7 @@ define  <8 x i64> @mov_test21(i8 * %addr
 define void @mov_test22(i8 * %addr, <8 x i64> %data) {
 ; GENERIC-LABEL: mov_test22:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovups %zmm0, (%rdi)
+; GENERIC-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6273,7 +6273,7 @@ define <8 x i64> @mov_test23(i8 * %addr)
 define void @mov_test24(i8 * %addr, <8 x double> %data) {
 ; GENERIC-LABEL: mov_test24:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps %zmm0, (%rdi)
+; GENERIC-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6305,7 +6305,7 @@ define <8 x double> @mov_test25(i8 * %ad
 define void @mov_test26(i8 * %addr, <16 x float> %data) {
 ; GENERIC-LABEL: mov_test26:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps %zmm0, (%rdi)
+; GENERIC-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6337,7 +6337,7 @@ define <16 x float> @mov_test27(i8 * %ad
 define void @mov_test28(i8 * %addr, <8 x double> %data) {
 ; GENERIC-LABEL: mov_test28:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovups %zmm0, (%rdi)
+; GENERIC-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6369,7 +6369,7 @@ define <8 x double> @mov_test29(i8 * %ad
 define void @mov_test30(i8 * %addr, <16 x float> %data) {
 ; GENERIC-LABEL: mov_test30:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovups %zmm0, (%rdi)
+; GENERIC-NEXT:    vmovups %zmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -7454,14 +7454,14 @@ define <32 x i16> @vmov_test21(<32 x i16
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vmov_test21:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50]
 ; SKX-NEXT:    vpmovb2m %ymm1, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
   ret <32 x i16> %ret
@@ -7686,7 +7686,7 @@ define <32 x i16> @test_build_vec_v32i1(
 ; GENERIC-NEXT:    movl $1497715861, %eax # imm = 0x59455495
 ; GENERIC-NEXT:    # sched: [1:0.33]
 ; GENERIC-NEXT:    kmovd %eax, %k1
-; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_build_vec_v32i1:
@@ -7694,7 +7694,7 @@ define <32 x i16> @test_build_vec_v32i1(
 ; SKX-NEXT:    movl $1497715861, %eax # imm = 0x59455495
 ; SKX-NEXT:    # sched: [1:0.25]
 ; SKX-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
   ret <32 x i16> %ret
@@ -7724,11 +7724,11 @@ define void @ktest_1(<8 x double> %in, d
 ; GENERIC-NEXT:    ktestb %k0, %k0
 ; GENERIC-NEXT:    je .LBB410_2 # sched: [1:1.00]
 ; GENERIC-NEXT:  # %bb.1: # %L1
-; GENERIC-NEXT:    vmovapd %zmm0, (%rdi)
+; GENERIC-NEXT:    vmovapd %zmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ; GENERIC-NEXT:  .LBB410_2: # %L2
-; GENERIC-NEXT:    vmovapd %zmm0, 8(%rdi)
+; GENERIC-NEXT:    vmovapd %zmm0, 8(%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -7793,13 +7793,13 @@ define void @ktest_2(<32 x float> %in, f
 ; GENERIC-NEXT:    ktestd %k0, %k0
 ; GENERIC-NEXT:    je .LBB411_2 # sched: [1:1.00]
 ; GENERIC-NEXT:  # %bb.1: # %L1
-; GENERIC-NEXT:    vmovaps %zmm0, (%rdi)
-; GENERIC-NEXT:    vmovaps %zmm1, 64(%rdi)
+; GENERIC-NEXT:    vmovaps %zmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, 64(%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ; GENERIC-NEXT:  .LBB411_2: # %L2
-; GENERIC-NEXT:    vmovaps %zmm0, 4(%rdi)
-; GENERIC-NEXT:    vmovaps %zmm1, 68(%rdi)
+; GENERIC-NEXT:    vmovaps %zmm0, 4(%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, 68(%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -8341,7 +8341,7 @@ define   <16 x float> @_ss16xfloat_mask(
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _ss16xfloat_mask:
@@ -8460,7 +8460,7 @@ define   <8 x double> @_sd8xdouble_mask(
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _sd8xdouble_mask:
@@ -8468,7 +8468,7 @@ define   <8 x double> @_sd8xdouble_mask(
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
   %b = insertelement <8 x double> undef, double %a, i32 0
@@ -8593,7 +8593,7 @@ define <16 x i32> @test_vbroadcast() {
 ; GENERIC-NEXT:    vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpmovm2d %k0, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    knotw %k0, %k1
-; GENERIC-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_vbroadcast:
@@ -8602,7 +8602,7 @@ define <16 x i32> @test_vbroadcast() {
 ; SKX-NEXT:    vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00]
 ; SKX-NEXT:    vpmovm2d %k0, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    knotw %k0, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 entry:
   %0 = sext <16 x i1> zeroinitializer to <16 x i32>

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll?rev=319913&r1=319912&r2=319913&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll Wed Dec  6 09:59:26 2017
@@ -452,7 +452,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mask0:
@@ -461,7 +461,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -497,7 +497,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mask1:
@@ -506,7 +506,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -542,7 +542,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mask2:
@@ -551,7 +551,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -602,7 +602,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mask3:
@@ -611,7 +611,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -1304,7 +1304,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mask0:
@@ -1313,7 +1313,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1349,7 +1349,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mask1:
@@ -1358,7 +1358,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1394,7 +1394,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mask2:
@@ -1403,7 +1403,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1454,7 +1454,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mask3:
@@ -1463,7 +1463,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -2116,7 +2116,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mask0:
@@ -2125,7 +2125,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2160,7 +2160,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask1:
@@ -2168,7 +2168,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2202,7 +2202,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mask2:
@@ -2211,7 +2211,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2259,7 +2259,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask3:
@@ -2267,7 +2267,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2301,7 +2301,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mask4:
@@ -2310,7 +2310,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2345,7 +2345,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask5:
@@ -2353,7 +2353,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2402,7 +2402,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mask6:
@@ -2411,7 +2411,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2446,7 +2446,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask7:
@@ -2454,7 +2454,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -3327,7 +3327,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mask0:
@@ -3372,7 +3372,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mask1:
@@ -3417,7 +3417,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mask2:
@@ -3477,7 +3477,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mask3:
@@ -4139,7 +4139,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mask0:
@@ -4148,7 +4148,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4183,7 +4183,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1:
@@ -4191,7 +4191,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4225,7 +4225,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mask2:
@@ -4234,7 +4234,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4282,7 +4282,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3:
@@ -4290,7 +4290,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4324,7 +4324,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mask4:
@@ -4333,7 +4333,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4368,7 +4368,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5:
@@ -4376,7 +4376,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4425,7 +4425,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mask6:
@@ -4434,7 +4434,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4469,7 +4469,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7:
@@ -4477,7 +4477,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -5733,7 +5733,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask0:
@@ -5741,7 +5741,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -5774,7 +5774,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask1:
@@ -5782,7 +5782,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -5815,7 +5815,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask2:
@@ -5823,7 +5823,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -5869,7 +5869,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask3:
@@ -5877,7 +5877,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -7629,7 +7629,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask0:
@@ -7637,7 +7637,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7670,7 +7670,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask1:
@@ -7678,7 +7678,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7711,7 +7711,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask2:
@@ -7719,7 +7719,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7765,7 +7765,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask3:
@@ -7773,7 +7773,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7806,7 +7806,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask4:
@@ -7814,7 +7814,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7847,7 +7847,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask5:
@@ -7855,7 +7855,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7901,7 +7901,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask6:
@@ -7909,7 +7909,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7942,7 +7942,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask7:
@@ -7950,7 +7950,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -8221,7 +8221,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5:
@@ -8229,7 +8229,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1}
+; SKX-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
@@ -8244,7 +8244,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
@@ -8252,7 +8252,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
@@ -9150,7 +9150,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask0:
@@ -9158,7 +9158,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -9191,7 +9191,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask1:
@@ -9199,7 +9199,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -9232,7 +9232,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask2:
@@ -9240,7 +9240,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -9286,7 +9286,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask3:
@@ -9294,7 +9294,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -9930,7 +9930,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask0:
@@ -9971,7 +9971,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask1:
@@ -10012,7 +10012,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask2:
@@ -10066,7 +10066,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask3:
@@ -10121,7 +10121,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0:
@@ -10165,7 +10165,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1:
@@ -10209,7 +10209,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2:
@@ -10267,7 +10267,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3:
@@ -10718,7 +10718,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask0:
@@ -10726,7 +10726,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -10759,7 +10759,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask1:
@@ -10767,7 +10767,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -10800,7 +10800,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask2:
@@ -10808,7 +10808,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -10854,7 +10854,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask3:
@@ -10862,7 +10862,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -10909,7 +10909,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0:
@@ -10917,7 +10917,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
@@ -10953,7 +10953,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1:
@@ -10961,7 +10961,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -10997,7 +10997,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2:
@@ -11005,7 +11005,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
@@ -11055,7 +11055,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3:
@@ -11063,7 +11063,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
@@ -11506,7 +11506,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask0:
@@ -11514,7 +11514,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -11547,7 +11547,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask1:
@@ -11555,7 +11555,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -11588,7 +11588,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask2:
@@ -11596,7 +11596,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -11642,7 +11642,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask3:
@@ -11650,7 +11650,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -11697,7 +11697,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0:
@@ -11705,7 +11705,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
@@ -11741,7 +11741,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1:
@@ -11749,7 +11749,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
@@ -11785,7 +11785,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2:
@@ -11793,7 +11793,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
@@ -11843,7 +11843,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3:
@@ -11851,7 +11851,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -12294,7 +12294,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask0:
@@ -12302,7 +12302,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -12335,7 +12335,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask1:
@@ -12343,7 +12343,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -12376,7 +12376,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask2:
@@ -12384,7 +12384,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -12430,7 +12430,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask3:
@@ -12438,7 +12438,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -12485,7 +12485,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0:
@@ -12493,7 +12493,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
@@ -12529,7 +12529,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1:
@@ -12537,7 +12537,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
@@ -12573,7 +12573,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2:
@@ -12581,7 +12581,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
@@ -12631,7 +12631,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3:
@@ -12639,7 +12639,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
@@ -13476,7 +13476,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0:
@@ -13517,7 +13517,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1:
@@ -13558,7 +13558,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2:
@@ -13612,7 +13612,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3:
@@ -13667,7 +13667,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
@@ -13711,7 +13711,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
@@ -13755,7 +13755,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
@@ -13813,7 +13813,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
@@ -14461,7 +14461,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0:
@@ -14469,7 +14469,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -14502,7 +14502,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1:
@@ -14510,7 +14510,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -14543,7 +14543,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2:
@@ -14551,7 +14551,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -14597,7 +14597,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3:
@@ -14605,7 +14605,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -14652,7 +14652,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
@@ -14660,7 +14660,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14696,7 +14696,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
@@ -14704,7 +14704,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14740,7 +14740,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
@@ -14748,7 +14748,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14798,7 +14798,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
@@ -14806,7 +14806,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -15643,7 +15643,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0:
@@ -15684,7 +15684,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1:
@@ -15725,7 +15725,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2:
@@ -15779,7 +15779,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask3:
@@ -15834,7 +15834,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
@@ -15878,7 +15878,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
@@ -15922,7 +15922,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
@@ -15980,7 +15980,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
@@ -16628,7 +16628,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0:
@@ -16636,7 +16636,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16669,7 +16669,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1:
@@ -16677,7 +16677,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16710,7 +16710,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2:
@@ -16718,7 +16718,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16764,7 +16764,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3:
@@ -16772,7 +16772,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16819,7 +16819,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
@@ -16827,7 +16827,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16863,7 +16863,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
@@ -16871,7 +16871,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16907,7 +16907,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
@@ -16915,7 +16915,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16965,7 +16965,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
@@ -16973,7 +16973,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>



