[llvm] r329256 - [X86] Revert r329251-329254

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 4 22:19:36 PDT 2018


Author: ctopper
Date: Wed Apr  4 22:19:36 2018
New Revision: 329256

URL: http://llvm.org/viewvc/llvm-project?rev=329256&view=rev
Log:
[X86] Revert r329251-329254

The series r329251-329254 is failing on the bots and I'm not sure why, so revert all four commits.

This reverts:

[X86] Synchronize the SchedRW on some EVEX instructions with their VEX equivalents.
[X86] Use WriteFShuffle256 for VEXTRACTF128 to be consistent with VEXTRACTI128 which uses WriteShuffle256.
[X86] Remove some InstRWs for plain store instructions on Sandy Bridge.
[X86] Auto-generate complete checks. NFC

Modified:
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/lib/Target/X86/X86InstrSSE.td
    llvm/trunk/lib/Target/X86/X86SchedBroadwell.td
    llvm/trunk/lib/Target/X86/X86SchedHaswell.td
    llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td
    llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td
    llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td
    llvm/trunk/test/CodeGen/X86/avx-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
    llvm/trunk/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
    llvm/trunk/test/CodeGen/X86/fp128-i128.ll
    llvm/trunk/test/CodeGen/X86/memcpy-2.ll
    llvm/trunk/test/CodeGen/X86/misched-matrix.ll
    llvm/trunk/test/CodeGen/X86/mmx-schedule.ll
    llvm/trunk/test/CodeGen/X86/pr31045.ll
    llvm/trunk/test/CodeGen/X86/pr34080.ll
    llvm/trunk/test/CodeGen/X86/schedule-x86-64-shld.ll
    llvm/trunk/test/CodeGen/X86/schedule-x86_64.ll
    llvm/trunk/test/CodeGen/X86/sse-schedule.ll
    llvm/trunk/test/CodeGen/X86/sse2-schedule.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Wed Apr  4 22:19:36 2018
@@ -3193,13 +3193,12 @@ defm : operation_subvector_mask_lowering
 
 multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
                        X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload,
-                       SchedWrite SchedRR, SchedWrite SchedRM,
                        bit NoRMPattern = 0,
                        SDPatternOperator SelectOprr = vselect> {
   let hasSideEffects = 0 in {
   def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
-                    _.ExeDomain, itins.rr>, EVEX, Sched<[SchedRR]>;
+                    _.ExeDomain, itins.rr>, EVEX, Sched<[WriteMove]>;
   def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
                       (ins _.KRCWM:$mask,  _.RC:$src),
                       !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
@@ -3207,7 +3206,7 @@ multiclass avx512_load<bits<8> opc, stri
                        [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
                                            (_.VT _.RC:$src),
                                            _.ImmAllZerosV)))], _.ExeDomain,
-                       itins.rr>, EVEX, EVEX_KZ, Sched<[SchedRR]>;
+                       itins.rr>, EVEX, EVEX_KZ, Sched<[WriteMove]>;
 
   let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1 in
   def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
@@ -3215,7 +3214,7 @@ multiclass avx512_load<bits<8> opc, stri
                     !if(NoRMPattern, [],
                         [(set _.RC:$dst,
                           (_.VT (bitconvert (ld_frag addr:$src))))]),
-                    _.ExeDomain, itins.rm>, EVEX, Sched<[SchedRM]>;
+                    _.ExeDomain, itins.rm>, EVEX, Sched<[WriteLoad]>;
 
   let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
     def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
@@ -3225,7 +3224,7 @@ multiclass avx512_load<bits<8> opc, stri
                       [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
                                           (_.VT _.RC:$src1),
                                           (_.VT _.RC:$src0))))], _.ExeDomain,
-                       itins.rr>, EVEX, EVEX_K, Sched<[SchedRR]>;
+                       itins.rr>, EVEX, EVEX_K, Sched<[WriteMove]>;
     def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
                      (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
                      !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
@@ -3234,7 +3233,7 @@ multiclass avx512_load<bits<8> opc, stri
                          (vselect _.KRCWM:$mask,
                           (_.VT (bitconvert (ld_frag addr:$src1))),
                            (_.VT _.RC:$src0))))], _.ExeDomain, itins.rm>,
-                     EVEX, EVEX_K, Sched<[SchedRM]>;
+                     EVEX, EVEX_K, Sched<[WriteLoad]>;
   }
   def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
                   (ins _.KRCWM:$mask, _.MemOp:$src),
@@ -3242,7 +3241,7 @@ multiclass avx512_load<bits<8> opc, stri
                                 "${dst} {${mask}} {z}, $src}",
                   [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
                     (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
-                  _.ExeDomain, itins.rm>, EVEX, EVEX_KZ, Sched<[SchedRM]>;
+                  _.ExeDomain, itins.rm>, EVEX, EVEX_KZ, Sched<[WriteLoad]>;
   }
   def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
             (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
@@ -3257,64 +3256,63 @@ multiclass avx512_load<bits<8> opc, stri
 
 multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
                                   AVX512VLVectorVTInfo _,
-                                  Predicate prd, SchedWrite SchedRR,
-                                  SchedWrite SchedRM, bit NoRMPattern = 0> {
+                                  Predicate prd,
+                                  bit NoRMPattern = 0> {
   let Predicates = [prd] in
   defm Z : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info512,
                        _.info512.AlignedLdFrag, masked_load_aligned512,
-                       SchedRR, SchedRM, NoRMPattern>, EVEX_V512;
+                       NoRMPattern>, EVEX_V512;
 
   let Predicates = [prd, HasVLX] in {
   defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info256,
                           _.info256.AlignedLdFrag, masked_load_aligned256,
-                          SchedRR, SchedRM, NoRMPattern>, EVEX_V256;
+                          NoRMPattern>, EVEX_V256;
   defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info128,
                           _.info128.AlignedLdFrag, masked_load_aligned128,
-                          SchedRR, SchedRM, NoRMPattern>, EVEX_V128;
+                          NoRMPattern>, EVEX_V128;
   }
 }
 
 multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
                                   AVX512VLVectorVTInfo _,
-                                  Predicate prd, SchedWrite SchedRR,
-                                  SchedWrite SchedRM, bit NoRMPattern = 0,
+                                  Predicate prd,
+                                  bit NoRMPattern = 0,
                                   SDPatternOperator SelectOprr = vselect> {
   let Predicates = [prd] in
   defm Z : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info512, _.info512.LdFrag,
-                       masked_load_unaligned, SchedRR, SchedRM, NoRMPattern,
+                       masked_load_unaligned, NoRMPattern,
                        SelectOprr>, EVEX_V512;
 
   let Predicates = [prd, HasVLX] in {
   defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info256, _.info256.LdFrag,
-                         masked_load_unaligned, SchedRR, SchedRM, NoRMPattern,
+                         masked_load_unaligned, NoRMPattern,
                          SelectOprr>, EVEX_V256;
   defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info128, _.info128.LdFrag,
-                         masked_load_unaligned, SchedRR, SchedRM, NoRMPattern,
+                         masked_load_unaligned, NoRMPattern,
                          SelectOprr>, EVEX_V128;
   }
 }
 
 multiclass avx512_store<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
                         X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore,
-                        string Name, SchedWrite SchedRR, SchedWrite SchedMR,
-                        bit NoMRPattern = 0> {
+                        string Name, bit NoMRPattern = 0> {
   let hasSideEffects = 0 in {
   def rr_REV  : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
                          OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
                          [], _.ExeDomain, itins.rr>, EVEX, FoldGenData<Name#rr>,
-                         Sched<[SchedRR]>;
+                         Sched<[WriteMove]>;
   def rrk_REV : AVX512PI<opc, MRMDestReg, (outs  _.RC:$dst),
                          (ins _.KRCWM:$mask, _.RC:$src),
                          OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"#
                          "${dst} {${mask}}, $src}",
                          [], _.ExeDomain, itins.rr>,  EVEX, EVEX_K,
-                         FoldGenData<Name#rrk>, Sched<[SchedRR]>;
+                         FoldGenData<Name#rrk>, Sched<[WriteMove]>;
   def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs  _.RC:$dst),
                           (ins _.KRCWM:$mask, _.RC:$src),
                           OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
                           "${dst} {${mask}} {z}, $src}",
                           [], _.ExeDomain, itins.rr>, EVEX, EVEX_KZ,
-                          FoldGenData<Name#rrkz>, Sched<[SchedRR]>;
+                          FoldGenData<Name#rrkz>, Sched<[WriteMove]>;
   }
 
   let hasSideEffects = 0, mayStore = 1 in
@@ -3322,11 +3320,11 @@ multiclass avx512_store<bits<8> opc, str
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     !if(NoMRPattern, [],
                         [(st_frag (_.VT _.RC:$src), addr:$dst)]),
-                    _.ExeDomain, itins.mr>, EVEX, Sched<[SchedMR]>;
+                    _.ExeDomain, itins.mr>, EVEX, Sched<[WriteStore]>;
   def mrk : AVX512PI<opc, MRMDestMem, (outs),
                      (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
               OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
-               [], _.ExeDomain, itins.mr>, EVEX, EVEX_K, Sched<[SchedMR]>;
+               [], _.ExeDomain, itins.mr>, EVEX, EVEX_K, Sched<[WriteStore]>;
 
   def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),
            (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr,
@@ -3336,108 +3334,95 @@ multiclass avx512_store<bits<8> opc, str
 
 multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
                             AVX512VLVectorVTInfo _, Predicate prd,
-                            string Name, SchedWrite SchedRR, SchedWrite SchedMR,
-                            bit NoMRPattern = 0> {
+                            string Name, bit NoMRPattern = 0> {
   let Predicates = [prd] in
   defm Z : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info512, store,
-                        masked_store_unaligned, Name#Z, SchedRR, SchedMR,
-                        NoMRPattern>, EVEX_V512;
+                        masked_store_unaligned, Name#Z, NoMRPattern>, EVEX_V512;
+
   let Predicates = [prd, HasVLX] in {
     defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info256, store,
-                             masked_store_unaligned, Name#Z256, SchedRR,
-                             SchedMR, NoMRPattern>, EVEX_V256;
+                             masked_store_unaligned, Name#Z256,
+                             NoMRPattern>, EVEX_V256;
     defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info128, store,
-                             masked_store_unaligned, Name#Z128, SchedRR,
-                             SchedMR, NoMRPattern>, EVEX_V128;
+                             masked_store_unaligned, Name#Z128,
+                             NoMRPattern>, EVEX_V128;
   }
 }
 
 multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
                                   AVX512VLVectorVTInfo _,  Predicate prd,
-                                  string Name, SchedWrite SchedRR,
-                                  SchedWrite SchedMR, bit NoMRPattern = 0> {
+                                  string Name, bit NoMRPattern = 0> {
   let Predicates = [prd] in
   defm Z : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info512, alignedstore,
-                        masked_store_aligned512, Name#Z, SchedRR, SchedMR,
+                        masked_store_aligned512, Name#Z,
                         NoMRPattern>, EVEX_V512;
 
   let Predicates = [prd, HasVLX] in {
     defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info256, alignedstore,
-                             masked_store_aligned256, Name#Z256, SchedRR,
-                             SchedMR, NoMRPattern>, EVEX_V256;
+                             masked_store_aligned256, Name#Z256,
+                             NoMRPattern>, EVEX_V256;
     defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info128, alignedstore,
-                             masked_store_aligned128, Name#Z128, SchedRR,
-                             SchedMR, NoMRPattern>, EVEX_V128;
+                             masked_store_aligned128, Name#Z128,
+                             NoMRPattern>, EVEX_V128;
   }
 }
 
 defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
-                                     HasAVX512, WriteFMove, WriteFLoad>,
+                                     HasAVX512>,
                avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
-                                      HasAVX512, "VMOVAPS", WriteFMove,
-                                      WriteFStore>,
+                                      HasAVX512, "VMOVAPS">,
                PS, EVEX_CD8<32, CD8VF>;
 
 defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
-                                     HasAVX512, WriteFMove, WriteFLoad>,
+                                     HasAVX512>,
                avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
-                                     HasAVX512, "VMOVAPD", WriteFMove,
-                                     WriteFStore>,
+                                     HasAVX512, "VMOVAPD">,
                PD, VEX_W, EVEX_CD8<64, CD8VF>;
 
 defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
-                              WriteFMove, WriteFLoad, 0, null_frag>,
+                              0, null_frag>,
                avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512,
-                               "VMOVUPS", WriteFMove, WriteFStore>,
+                               "VMOVUPS">,
                               PS, EVEX_CD8<32, CD8VF>;
 
 defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
-                              WriteFMove, WriteFLoad, 0, null_frag>,
+                              0, null_frag>,
                avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512,
-                               "VMOVUPD", WriteFMove, WriteFStore>,
+                               "VMOVUPD">,
                PD, VEX_W, EVEX_CD8<64, CD8VF>;
 
 defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
-                                       HasAVX512, WriteVecMove, WriteVecLoad,
-                                       1>,
+                                       HasAVX512, 1>,
                  avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
-                                       HasAVX512, "VMOVDQA32", WriteVecMove,
-                                       WriteVecStore, 1>,
+                                       HasAVX512, "VMOVDQA32", 1>,
                  PD, EVEX_CD8<32, CD8VF>;
 
 defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
-                                       HasAVX512, WriteVecMove, WriteVecLoad>,
+                                       HasAVX512>,
                  avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
-                                    HasAVX512, "VMOVDQA64", WriteVecMove,
-                                    WriteVecStore>,
+                                    HasAVX512, "VMOVDQA64">,
                  PD, VEX_W, EVEX_CD8<64, CD8VF>;
 
-defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI,
-                               WriteVecMove, WriteVecLoad, 1>,
+defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, 1>,
                 avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
-                                 HasBWI, "VMOVDQU8", WriteVecMove,
-                                 WriteVecStore, 1>,
+                                 HasBWI, "VMOVDQU8", 1>,
                 XD, EVEX_CD8<8, CD8VF>;
 
-defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI,
-                               WriteVecMove, WriteVecLoad, 1>,
+defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, 1>,
                  avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
-                                 HasBWI, "VMOVDQU16", WriteVecMove,
-                                 WriteVecStore, 1>,
+                                 HasBWI, "VMOVDQU16", 1>,
                  XD, VEX_W, EVEX_CD8<16, CD8VF>;
 
 defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
-                                WriteVecMove, WriteVecLoad, 1, null_frag>,
+                                1, null_frag>,
                  avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
-                                 HasAVX512, "VMOVDQU32", WriteVecMove,
-                                 WriteVecStore, 1>,
+                                 HasAVX512, "VMOVDQU32", 1>,
                  XS, EVEX_CD8<32, CD8VF>;
 
 defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
-                                WriteVecMove, WriteVecLoad, 0, null_frag>,
+                                0, null_frag>,
                  avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
-                                 HasAVX512, "VMOVDQU64", WriteVecMove,
-                                 WriteVecStore>,
+                                 HasAVX512, "VMOVDQU64">,
                  XS, VEX_W, EVEX_CD8<64, CD8VF>;
 
 // Special instructions to help with spilling when we don't have VLX. We need
@@ -3747,7 +3732,7 @@ let hasSideEffects = 0 in
 def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
                              (ins VR128X:$src),
                              "vmovq.s\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>,
-                             EVEX, VEX_W, Sched<[WriteVecLogic]>;
+                             EVEX, VEX_W, Sched<[WriteMove]>;
 } // ExeDomain = SSEPackedInt
 
 // Move Scalar Single to Double Int
@@ -3792,7 +3777,7 @@ multiclass avx512_move_scalar<string asm
              (ins _.RC:$src1, _.RC:$src2),
              !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
-             _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, Sched<[WriteFShuffle]>;
+             _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, Sched<[WriteMove]>;
   def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
               (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
               !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
@@ -3800,7 +3785,7 @@ multiclass avx512_move_scalar<string asm
               [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
                                       (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
                                       _.ImmAllZerosV)))],
-              _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ, Sched<[WriteFShuffle]>;
+              _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ, Sched<[WriteMove]>;
   let Constraints = "$src0 = $dst"  in
   def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
              (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
@@ -3809,7 +3794,7 @@ multiclass avx512_move_scalar<string asm
              [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
                                      (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
                                      (_.VT _.RC:$src0))))],
-             _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K, Sched<[WriteFShuffle]>;
+             _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K, Sched<[WriteMove]>;
   let canFoldAsLoad = 1, isReMaterializable = 1 in
   def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
@@ -4003,7 +3988,7 @@ let hasSideEffects = 0 in {
                            (ins VR128X:$src1, VR128X:$src2),
                            "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                            [], IIC_SSE_MOV_S_RR>, XS, EVEX_4V, VEX_LIG,
-                           FoldGenData<"VMOVSSZrr">, Sched<[WriteFShuffle]>;
+                           FoldGenData<"VMOVSSZrr">, Sched<[WriteMove]>;
 
 let Constraints = "$src0 = $dst" in
   def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
@@ -4012,20 +3997,20 @@ let Constraints = "$src0 = $dst" in
                              "vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
                                         "$dst {${mask}}, $src1, $src2}",
                              [], IIC_SSE_MOV_S_RR>, EVEX_K, XS, EVEX_4V, VEX_LIG,
-                             FoldGenData<"VMOVSSZrrk">, Sched<[WriteFShuffle]>;
+                             FoldGenData<"VMOVSSZrrk">, Sched<[WriteMove]>;
 
   def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
                          (ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2),
                          "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"#
                                     "$dst {${mask}} {z}, $src1, $src2}",
                          [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
-                         FoldGenData<"VMOVSSZrrkz">, Sched<[WriteFShuffle]>;
+                         FoldGenData<"VMOVSSZrrkz">, Sched<[WriteMove]>;
 
   def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
                            (ins VR128X:$src1, VR128X:$src2),
                            "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                            [], IIC_SSE_MOV_S_RR>, XD, EVEX_4V, VEX_LIG, VEX_W,
-                           FoldGenData<"VMOVSDZrr">, Sched<[WriteFShuffle]>;
+                           FoldGenData<"VMOVSDZrr">, Sched<[WriteMove]>;
 
 let Constraints = "$src0 = $dst" in
   def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
@@ -4034,7 +4019,7 @@ let Constraints = "$src0 = $dst" in
                              "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"#
                                         "$dst {${mask}}, $src1, $src2}",
                              [], IIC_SSE_MOV_S_RR>, EVEX_K, XD, EVEX_4V, VEX_LIG,
-                             VEX_W, FoldGenData<"VMOVSDZrrk">, Sched<[WriteFShuffle]>;
+                             VEX_W, FoldGenData<"VMOVSDZrrk">, Sched<[WriteMove]>;
 
   def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
                               (ins f64x_info.KRCWM:$mask, VR128X:$src1,
@@ -4042,7 +4027,7 @@ let Constraints = "$src0 = $dst" in
                               "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
                                          "$dst {${mask}} {z}, $src1, $src2}",
                               [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
-                              VEX_W, FoldGenData<"VMOVSDZrrkz">, Sched<[WriteFShuffle]>;
+                              VEX_W, FoldGenData<"VMOVSDZrrkz">, Sched<[WriteMove]>;
 }
 
 let Predicates = [HasAVX512] in {
@@ -4245,7 +4230,7 @@ let Predicates = [HasAVX512] in {
 //===----------------------------------------------------------------------===//
 // AVX-512 - Non-temporals
 //===----------------------------------------------------------------------===//
-let SchedRW = [WriteVecLoad] in {
+let SchedRW = [WriteLoad] in {
   def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
                         (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
                         [], SSEPackedInt>, EVEX, T8PD, EVEX_V512,
@@ -4269,7 +4254,7 @@ let SchedRW = [WriteVecLoad] in {
 multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                         PatFrag st_frag = alignednontemporalstore,
                         InstrItinClass itin = IIC_SSE_MOVNT> {
-  let SchedRW = [WriteVecStore], AddedComplexity = 400 in
+  let SchedRW = [WriteStore], AddedComplexity = 400 in
   def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(st_frag (_.VT _.RC:$src), addr:$dst)],

Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Wed Apr  4 22:19:36 2018
@@ -7611,7 +7611,7 @@ let hasSideEffects = 0, ExeDomain = SSEP
 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
           (ins VR256:$src1, u8imm:$src2),
           "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
+          []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
 let mayStore = 1 in
 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
           (ins f128mem:$dst, VR256:$src1, u8imm:$src2),

Modified: llvm/trunk/lib/Target/X86/X86SchedBroadwell.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedBroadwell.td?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedBroadwell.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedBroadwell.td Wed Apr  4 22:19:36 2018
@@ -663,6 +663,9 @@ def: InstRW<[BWWriteResGroup10], (instre
                                             "MMX_MOVD64mr",
                                             "MMX_MOVNTQmr",
                                             "MMX_MOVQ64mr",
+                                            "MOV(16|32|64)mr",
+                                            "MOV8mi",
+                                            "MOV8mr",
                                             "MOVNTI_64mr",
                                             "MOVNTImr",
                                             "ST_FP32m",

Modified: llvm/trunk/lib/Target/X86/X86SchedHaswell.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedHaswell.td?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedHaswell.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedHaswell.td Wed Apr  4 22:19:36 2018
@@ -633,6 +633,9 @@ def: InstRW<[HWWriteResGroup1], (instreg
                                            "MMX_MOVD64mr",
                                            "MMX_MOVNTQmr",
                                            "MMX_MOVQ64mr",
+                                           "MOV(16|32|64)mr",
+                                           "MOV8mi",
+                                           "MOV8mr",
                                            "MOVNTI_64mr",
                                            "MOVNTImr",
                                            "ST_FP32m",

Modified: llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td Wed Apr  4 22:19:36 2018
@@ -925,8 +925,30 @@ def SBWriteResGroup33 : SchedWriteRes<[S
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[SBWriteResGroup33], (instregex "PUSH64i8",
-                                            "PUSH(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOV(8|16|32|64)mr",
+                                            "MOVNTI_64mr",
+                                            "MOVNTImr",
+                                            "PUSH64i8",
+                                            "PUSH(16|32|64)r",
+                                            "VEXTRACTF128mr",
+                                            "(V?)MOVAPD(Y?)mr",
+                                            "(V?)MOVAPS(Y?)mr",
+                                            "(V?)MOVDQA(Y?)mr",
+                                            "(V?)MOVDQU(Y?)mr",
+                                            "(V?)MOVHPDmr",
+                                            "(V?)MOVHPSmr",
+                                            "(V?)MOVLPDmr",
+                                            "(V?)MOVLPSmr",
+                                            "(V?)MOVNTDQ(Y?)mr",
+                                            "(V?)MOVNTPD(Y?)mr",
+                                            "(V?)MOVNTPS(Y?)mr",
+                                            "(V?)MOVPDI2DImr",
+                                            "(V?)MOVPQI2QImr",
+                                            "(V?)MOVPQIto64mr",
+                                            "(V?)MOVSDmr",
+                                            "(V?)MOVSSmr",
+                                            "(V?)MOVUPD(Y?)mr",
+                                            "(V?)MOVUPS(Y?)mr")>;
 
 def SBWriteResGroup34 : SchedWriteRes<[SBPort0,SBPort15]> {
   let Latency = 7;
@@ -976,7 +998,7 @@ def: InstRW<[SBWriteResGroup37], (instre
                                             "VMASKMOVPS(Y?)mr")>;
 
 def SBWriteResGroup38 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
-  let Latency = 2;
+  let Latency = 5;
   let NumMicroOps = 3;
   let ResourceCycles = [1,1,1];
 }
@@ -996,7 +1018,8 @@ def SBWriteResGroup40 : SchedWriteRes<[S
   let NumMicroOps = 3;
   let ResourceCycles = [1,1,1];
 }
-def: InstRW<[SBWriteResGroup40], (instregex "STOSB",
+def: InstRW<[SBWriteResGroup40], (instregex "MOV8mi",
+                                            "STOSB",
                                             "STOSL",
                                             "STOSQ",
                                             "STOSW")>;
@@ -1016,7 +1039,7 @@ def SBWriteResGroup42 : SchedWriteRes<[S
 def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG(8|16|32|64)rr")>;
 
 def SBWriteResGroup43 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
-  let Latency = 3;
+  let Latency = 5;
   let NumMicroOps = 4;
   let ResourceCycles = [1,1,2];
 }

Modified: llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td Wed Apr  4 22:19:36 2018
@@ -667,6 +667,8 @@ def: InstRW<[SKLWriteResGroup11], (instr
                                              "MMX_MOVD64mr",
                                              "MMX_MOVNTQmr",
                                              "MMX_MOVQ64mr",
+                                             "MOV(8|16|32|64)mr",
+                                             "MOV8mi",
                                              "MOVNTI_64mr",
                                              "MOVNTImr",
                                              "ST_FP32m",

Modified: llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td Wed Apr  4 22:19:36 2018
@@ -1331,6 +1331,9 @@ def: InstRW<[SKXWriteResGroup11], (instr
                                              "MMX_MOVD64mr",
                                              "MMX_MOVNTQmr",
                                              "MMX_MOVQ64mr",
+                                             "MOV(16|32|64)mr",
+                                             "MOV8mi",
+                                             "MOV8mr",
                                              "MOVAPDmr",
                                              "MOVAPSmr",
                                              "MOVDQAmr",

Modified: llvm/trunk/test/CodeGen/X86/avx-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-schedule.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-schedule.ll Wed Apr  4 22:19:36 2018
@@ -1662,14 +1662,14 @@ define <4 x float> @test_extractf128(<8
 ; GENERIC-LABEL: test_extractf128:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_extractf128:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    vzeroupper # sched: [100:0.33]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2526,14 +2526,14 @@ define <4 x double> @test_movapd(<4 x do
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd (%rdi), %ymm0 # sched: [7:0.50]
 ; GENERIC-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %ymm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %ymm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movapd:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovapd (%rdi), %ymm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovapd %ymm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovapd %ymm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movapd:
@@ -2588,14 +2588,14 @@ define <8 x float> @test_movaps(<8 x flo
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps (%rdi), %ymm0 # sched: [7:0.50]
 ; GENERIC-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %ymm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %ymm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movaps:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovaps (%rdi), %ymm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovaps %ymm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovaps %ymm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movaps:
@@ -2816,7 +2816,7 @@ define void @test_movntdq(<4 x i64> %a0,
 ; GENERIC-LABEL: test_movntdq:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2824,7 +2824,7 @@ define void @test_movntdq(<4 x i64> %a0,
 ; SANDY-LABEL: test_movntdq:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    #APP
-; SANDY-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    #NO_APP
 ; SANDY-NEXT:    vzeroupper # sched: [100:0.33]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
@@ -2883,13 +2883,13 @@ define <4 x double> @test_movntpd(<4 x d
 ; GENERIC-LABEL: test_movntpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movntpd:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movntpd:
@@ -2936,13 +2936,13 @@ define <8 x float> @test_movntps(<8 x fl
 ; GENERIC-LABEL: test_movntps:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovntps %ymm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovntps %ymm0, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movntps:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovntps %ymm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovntps %ymm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movntps:
@@ -3116,7 +3116,7 @@ define <4 x double> @test_movupd(<4 x do
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovupd (%rdi), %ymm0 # sched: [7:0.50]
 ; GENERIC-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovupd %ymm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovupd %ymm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movupd:
@@ -3124,8 +3124,8 @@ define <4 x double> @test_movupd(<4 x do
 ; SANDY-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    vmovupd %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movupd:
@@ -3180,7 +3180,7 @@ define <8 x float> @test_movups(<8 x flo
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovups (%rdi), %ymm0 # sched: [7:0.50]
 ; GENERIC-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovups %ymm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovups %ymm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movups:
@@ -3188,8 +3188,8 @@ define <8 x float> @test_movups(<8 x flo
 ; SANDY-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    vmovups %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movups:

Modified: llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-schedule.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-schedule.ll Wed Apr  4 22:19:36 2018
@@ -949,7 +949,7 @@ define <8 x double> @test_mask_broadcast
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_broadcast_vaddpd:
@@ -1937,7 +1937,7 @@ define void @f32tof64_loadstore() {
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
 ; GENERIC-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovsd %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovsd %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: f32tof64_loadstore:
@@ -1960,7 +1960,7 @@ define void @f64tof32_loadstore() nounwi
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
 ; GENERIC-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
-; GENERIC-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: f64tof32_loadstore:
@@ -2040,7 +2040,7 @@ define <16 x double> @uito16f64(<16 x i3
 ; GENERIC-NEXT:    vcvtudq2pd %ymm0, %zmm2 # sched: [4:1.00]
 ; GENERIC-NEXT:    vextractf64x4 $1, %zmm0, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcvtudq2pd %ymm0, %zmm1 # sched: [4:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: uito16f64:
@@ -2748,7 +2748,7 @@ define <16 x double> @sito16f64(<16 x i3
 ; GENERIC-NEXT:    vcvtdq2pd %ymm0, %zmm2 # sched: [4:1.00]
 ; GENERIC-NEXT:    vextractf64x4 $1, %zmm0, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcvtdq2pd %ymm0, %zmm1 # sched: [4:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sito16f64:
@@ -4495,9 +4495,9 @@ define <64 x i16> @test21(<64 x i16> %x
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %zmm2, %zmm2 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    kshiftrq $32, %k1, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test21:
@@ -4641,7 +4641,7 @@ define <64 x i8> @zext_64xi1_to_64xi8(<6
 ; GENERIC-LABEL: zext_64xi1_to_64xi8:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_64xi1_to_64xi8:
@@ -4695,7 +4695,7 @@ define <32 x i8> @zext_32xi1_to_32xi8(<3
 ; GENERIC-LABEL: zext_32xi1_to_32xi8:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_32xi1_to_32xi8:
@@ -4981,7 +4981,7 @@ define <16 x float> @test_x86_fmadd231_p
 ; GENERIC-NEXT:    vpmovb2m %xmm2, %k1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vmulps (%rdi), %zmm0, %zmm0 # sched: [9:1.00]
 ; GENERIC-NEXT:    vaddps %zmm1, %zmm0, %zmm1 {%k1} # sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_x86_fmadd231_ps:
@@ -5007,7 +5007,7 @@ define <16 x float> @test_x86_fmadd213_p
 ; GENERIC-NEXT:    vpmovb2m %xmm2, %k1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vmulps %zmm0, %zmm1, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    vaddps (%rdi), %zmm0, %zmm1 {%k1} # sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_x86_fmadd213_ps:
@@ -5934,7 +5934,7 @@ define <4 x i32> @mov_test4(i32* %x) {
 define void @mov_test5(float %x, float* %y) {
 ; GENERIC-LABEL: mov_test5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovss %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovss %xmm0, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test5:
@@ -5948,7 +5948,7 @@ define void @mov_test5(float %x, float*
 define void @mov_test6(double %x, double* %y) {
 ; GENERIC-LABEL: mov_test6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovsd %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    vmovsd %xmm0, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test6:
@@ -6093,7 +6093,7 @@ define <4 x i32> @mov_test15(i32* %x) {
 define <16 x i32> @mov_test16(i8 * %addr) {
 ; GENERIC-LABEL: mov_test16:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovups (%rdi), %zmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovups (%rdi), %zmm0 # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test16:
@@ -6108,7 +6108,7 @@ define <16 x i32> @mov_test16(i8 * %addr
 define <16 x i32> @mov_test17(i8 * %addr) {
 ; GENERIC-LABEL: mov_test17:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test17:
@@ -6174,7 +6174,7 @@ define void @mov_test20(i8 * %addr, <16
 define  <8 x i64> @mov_test21(i8 * %addr) {
 ; GENERIC-LABEL: mov_test21:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test21:
@@ -6206,7 +6206,7 @@ define void @mov_test22(i8 * %addr, <8 x
 define <8 x i64> @mov_test23(i8 * %addr) {
 ; GENERIC-LABEL: mov_test23:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovups (%rdi), %zmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovups (%rdi), %zmm0 # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test23:
@@ -6238,7 +6238,7 @@ define void @mov_test24(i8 * %addr, <8 x
 define <8 x double> @mov_test25(i8 * %addr) {
 ; GENERIC-LABEL: mov_test25:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test25:
@@ -6270,7 +6270,7 @@ define void @mov_test26(i8 * %addr, <16
 define <16 x float> @mov_test27(i8 * %addr) {
 ; GENERIC-LABEL: mov_test27:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test27:
@@ -6302,7 +6302,7 @@ define void @mov_test28(i8 * %addr, <8 x
 define <8 x double> @mov_test29(i8 * %addr) {
 ; GENERIC-LABEL: mov_test29:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovups (%rdi), %zmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovups (%rdi), %zmm0 # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test29:
@@ -6334,7 +6334,7 @@ define void @mov_test30(i8 * %addr, <16
 define <16 x float> @mov_test31(i8 * %addr) {
 ; GENERIC-LABEL: mov_test31:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovups (%rdi), %zmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovups (%rdi), %zmm0 # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test31:
@@ -6350,7 +6350,7 @@ define <16 x i32> @mov_test32(i8 * %addr
 ; GENERIC-LABEL: mov_test32:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test32:
@@ -6369,7 +6369,7 @@ define <16 x i32> @mov_test33(i8 * %addr
 ; GENERIC-LABEL: mov_test33:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test33:
@@ -6388,7 +6388,7 @@ define <16 x i32> @mov_test34(i8 * %addr
 ; GENERIC-LABEL: mov_test34:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test34:
@@ -6407,7 +6407,7 @@ define <16 x i32> @mov_test35(i8 * %addr
 ; GENERIC-LABEL: mov_test35:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test35:
@@ -6426,7 +6426,7 @@ define <8 x i64> @mov_test36(i8 * %addr,
 ; GENERIC-LABEL: mov_test36:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test36:
@@ -6445,7 +6445,7 @@ define <8 x i64> @mov_test37(i8 * %addr,
 ; GENERIC-LABEL: mov_test37:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test37:
@@ -6464,7 +6464,7 @@ define <8 x i64> @mov_test38(i8 * %addr,
 ; GENERIC-LABEL: mov_test38:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test38:
@@ -6483,7 +6483,7 @@ define <8 x i64> @mov_test39(i8 * %addr,
 ; GENERIC-LABEL: mov_test39:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test39:
@@ -6503,7 +6503,7 @@ define <16 x float> @mov_test40(i8 * %ad
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vxorps %xmm2, %xmm2, %xmm2 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 {%k1} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test40:
@@ -6524,7 +6524,7 @@ define <16 x float> @mov_test41(i8 * %ad
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vxorps %xmm2, %xmm2, %xmm2 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovups (%rdi), %zmm0 {%k1} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovups (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test41:
@@ -6545,7 +6545,7 @@ define <16 x float> @mov_test42(i8 * %ad
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test42:
@@ -6566,7 +6566,7 @@ define <16 x float> @mov_test43(i8 * %ad
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test43:
@@ -6587,7 +6587,7 @@ define <8 x double> @mov_test44(i8 * %ad
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd (%rdi), %zmm0 {%k1} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test44:
@@ -6608,7 +6608,7 @@ define <8 x double> @mov_test45(i8 * %ad
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovupd (%rdi), %zmm0 {%k1} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovupd (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test45:
@@ -6629,7 +6629,7 @@ define <8 x double> @mov_test46(i8 * %ad
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test46:
@@ -6650,7 +6650,7 @@ define <8 x double> @mov_test47(i8 * %ad
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test47:
@@ -6943,8 +6943,8 @@ define i8 @zext_test3(<16 x i32> %a, <16
 define i8 @conv1(<8 x i1>* %R) {
 ; GENERIC-LABEL: conv1:
 ; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movb $-1, (%rdi) # sched: [1:1.00]
-; GENERIC-NEXT:    movb $-2, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; GENERIC-NEXT:    movb $-1, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT:    movb $-2, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
 ; GENERIC-NEXT:    movb $-2, %al # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -7340,7 +7340,7 @@ define <32 x i16> @vmov_test21(<32 x i16
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vmov_test21:
@@ -7512,7 +7512,7 @@ define void @f1(i32 %c) {
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    movzbl {{.*}}(%rip), %edi # sched: [5:0.50]
 ; GENERIC-NEXT:    xorl $1, %edi # sched: [1:0.33]
-; GENERIC-NEXT:    movb %dil, {{.*}}(%rip) # sched: [1:1.00]
+; GENERIC-NEXT:    movb %dil, {{.*}}(%rip) # sched: [5:1.00]
 ; GENERIC-NEXT:    jmp f2 # TAILCALL
 ;
 ; SKX-LABEL: f1:
@@ -7536,7 +7536,7 @@ define void @store_i16_i1(i16 %x, i1 *%y
 ; GENERIC-LABEL: store_i16_i1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    andl $1, %edi # sched: [1:0.33]
-; GENERIC-NEXT:    movb %dil, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movb %dil, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: store_i16_i1:
@@ -7553,7 +7553,7 @@ define void @store_i8_i1(i8 %x, i1 *%y)
 ; GENERIC-LABEL: store_i8_i1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    andl $1, %edi # sched: [1:0.33]
-; GENERIC-NEXT:    movb %dil, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movb %dil, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: store_i8_i1:
@@ -7572,7 +7572,7 @@ define <32 x i16> @test_build_vec_v32i1(
 ; GENERIC-NEXT:    movl $1497715861, %eax # imm = 0x59455495
 ; GENERIC-NEXT:    # sched: [1:0.33]
 ; GENERIC-NEXT:    kmovd %eax, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_build_vec_v32i1:
@@ -7603,9 +7603,9 @@ define <64 x i8> @test_build_vec_v64i1(<
 define void @ktest_1(<8 x double> %in, double * %base) {
 ; GENERIC-LABEL: ktest_1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovupd (%rdi), %zmm1 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovupd (%rdi), %zmm1 # sched: [4:0.50]
 ; GENERIC-NEXT:    vcmpltpd %zmm0, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    kortestb %k0, %k0 # sched: [1:1.00]
 ; GENERIC-NEXT:    je .LBB410_2 # sched: [1:1.00]
@@ -7665,13 +7665,13 @@ define void @ktest_2(<32 x float> %in, f
 ;
 ; GENERIC-LABEL: ktest_2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovups (%rdi), %zmm2 # sched: [6:0.50]
-; GENERIC-NEXT:    vmovups 64(%rdi), %zmm3 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovups (%rdi), %zmm2 # sched: [4:0.50]
+; GENERIC-NEXT:    vmovups 64(%rdi), %zmm3 # sched: [4:0.50]
 ; GENERIC-NEXT:    vcmpltps %zmm0, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vcmpltps %zmm1, %zmm3, %k2 # sched: [3:1.00]
 ; GENERIC-NEXT:    kunpckwd %k1, %k2, %k0 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [6:0.50]
-; GENERIC-NEXT:    vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [6:0.50]
+; GENERIC-NEXT:    vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [4:0.50]
+; GENERIC-NEXT:    vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    vcmpltps %zmm3, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vcmpltps %zmm2, %zmm1, %k2 # sched: [3:1.00]
 ; GENERIC-NEXT:    kunpckwd %k1, %k2, %k1 # sched: [1:1.00]
@@ -8220,7 +8220,7 @@ define   <16 x float> @_ss16xfloat_mask(
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _ss16xfloat_mask:
@@ -8331,7 +8331,7 @@ define   <8 x double> @_sd8xdouble_mask(
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _sd8xdouble_mask:
@@ -8457,7 +8457,7 @@ define <16 x i32> @test_vbroadcast() {
 ; GENERIC-NEXT:    vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpmovm2d %k0, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    knotw %k0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_vbroadcast:
@@ -8698,7 +8698,7 @@ define <16 x float> @broadcast_ss_spill(
 ; GENERIC-NEXT:    subq $24, %rsp # sched: [1:0.33]
 ; GENERIC-NEXT:    .cfi_def_cfa_offset 32
 ; GENERIC-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [5:1.00]
 ; GENERIC-NEXT:    callq func_f32
 ; GENERIC-NEXT:    vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [5:1.00]
 ; GENERIC-NEXT:    addq $24, %rsp # sched: [1:0.33]
@@ -8728,7 +8728,7 @@ define <8 x double> @broadcast_sd_spill(
 ; GENERIC-NEXT:    subq $24, %rsp # sched: [1:0.33]
 ; GENERIC-NEXT:    .cfi_def_cfa_offset 32
 ; GENERIC-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [5:1.00]
 ; GENERIC-NEXT:    callq func_f64
 ; GENERIC-NEXT:    vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [5:1.00]
 ; GENERIC-NEXT:    addq $24, %rsp # sched: [1:0.33]

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll Wed Apr  4 22:19:36 2018
@@ -401,7 +401,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
 ; GENERIC-LABEL: test_32xi16_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -416,10 +416,10 @@ define <32 x i16> @test_32xi16_perm_mask
 define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mask0:
@@ -438,7 +438,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -457,10 +457,10 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mask1:
@@ -479,7 +479,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -498,10 +498,10 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mask2:
@@ -520,7 +520,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -539,7 +539,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
 ; GENERIC-LABEL: test_32xi16_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -554,10 +554,10 @@ define <32 x i16> @test_32xi16_perm_mask
 define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mask3:
@@ -576,7 +576,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -595,7 +595,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
 ; GENERIC-LABEL: test_32xi16_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -611,7 +611,7 @@ define <32 x i16> @test_32xi16_perm_mem_
 define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -632,7 +632,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -653,7 +653,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -674,7 +674,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -695,7 +695,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -716,7 +716,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -737,7 +737,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
 ; GENERIC-LABEL: test_32xi16_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -753,7 +753,7 @@ define <32 x i16> @test_32xi16_perm_mem_
 define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -774,7 +774,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1189,7 +1189,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -1204,10 +1204,10 @@ define <16 x i32> @test_16xi32_perm_mask
 define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mask0:
@@ -1226,7 +1226,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1245,10 +1245,10 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mask1:
@@ -1267,7 +1267,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1286,10 +1286,10 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mask2:
@@ -1308,7 +1308,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1327,7 +1327,7 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
 ; GENERIC-LABEL: test_16xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -1342,10 +1342,10 @@ define <16 x i32> @test_16xi32_perm_mask
 define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mask3:
@@ -1364,7 +1364,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1383,7 +1383,7 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
 ; GENERIC-LABEL: test_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -1399,7 +1399,7 @@ define <16 x i32> @test_16xi32_perm_mem_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1420,7 +1420,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1441,7 +1441,7 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1462,7 +1462,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1483,7 +1483,7 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1504,7 +1504,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1525,7 +1525,7 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
 ; GENERIC-LABEL: test_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -1541,7 +1541,7 @@ define <16 x i32> @test_16xi32_perm_mem_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1562,7 +1562,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1937,7 +1937,7 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
 ; GENERIC-LABEL: test_8xi64_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -1952,10 +1952,10 @@ define <8 x i64> @test_8xi64_perm_mask0(
 define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mask0:
@@ -1974,7 +1974,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1995,7 +1995,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask1:
@@ -2030,10 +2030,10 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mask2:
@@ -2052,7 +2052,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2086,7 +2086,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask3:
@@ -2121,10 +2121,10 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mask4:
@@ -2143,7 +2143,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2164,7 +2164,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask5:
@@ -2199,7 +2199,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
 ; GENERIC-LABEL: test_8xi64_perm_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2214,10 +2214,10 @@ define <8 x i64> @test_8xi64_perm_mask6(
 define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mask6:
@@ -2236,7 +2236,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2257,7 +2257,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask7:
@@ -2292,7 +2292,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
 ; GENERIC-LABEL: test_8xi64_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2308,7 +2308,7 @@ define <8 x i64> @test_8xi64_perm_mem_ma
 define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2329,7 +2329,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2388,7 +2388,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2409,7 +2409,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2482,7 +2482,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2503,7 +2503,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2562,7 +2562,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
 ; GENERIC-LABEL: test_8xi64_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2578,7 +2578,7 @@ define <8 x i64> @test_8xi64_perm_mem_ma
 define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2599,7 +2599,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3052,7 +3052,7 @@ define <8 x float> @test_masked_z_8xfloa
 define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
 ; GENERIC-LABEL: test_16xfloat_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -3067,10 +3067,10 @@ define <16 x float> @test_16xfloat_perm_
 define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mask0:
@@ -3089,7 +3089,7 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3108,10 +3108,10 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mask1:
@@ -3130,7 +3130,7 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3149,10 +3149,10 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mask2:
@@ -3171,7 +3171,7 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3190,7 +3190,7 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
 ; GENERIC-LABEL: test_16xfloat_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -3205,10 +3205,10 @@ define <16 x float> @test_16xfloat_perm_
 define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mask3:
@@ -3227,7 +3227,7 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3246,7 +3246,7 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
 ; GENERIC-LABEL: test_16xfloat_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -3262,7 +3262,7 @@ define <16 x float> @test_16xfloat_perm_
 define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3283,7 +3283,7 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3304,7 +3304,7 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3325,7 +3325,7 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3346,7 +3346,7 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3367,7 +3367,7 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3388,7 +3388,7 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
 ; GENERIC-LABEL: test_16xfloat_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -3404,7 +3404,7 @@ define <16 x float> @test_16xfloat_perm_
 define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3425,7 +3425,7 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3800,7 +3800,7 @@ define <4 x double> @test_masked_z_4xdou
 define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
 ; GENERIC-LABEL: test_8xdouble_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -3815,10 +3815,10 @@ define <8 x double> @test_8xdouble_perm_
 define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mask0:
@@ -3837,7 +3837,7 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3858,7 +3858,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1:
@@ -3893,10 +3893,10 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mask2:
@@ -3915,7 +3915,7 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3949,7 +3949,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3:
@@ -3984,10 +3984,10 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mask4:
@@ -4006,7 +4006,7 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4027,7 +4027,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5:
@@ -4062,7 +4062,7 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
 ; GENERIC-LABEL: test_8xdouble_perm_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -4077,10 +4077,10 @@ define <8 x double> @test_8xdouble_perm_
 define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mask6:
@@ -4099,7 +4099,7 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4120,7 +4120,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7:
@@ -4155,7 +4155,7 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
 ; GENERIC-LABEL: test_8xdouble_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -4171,7 +4171,7 @@ define <8 x double> @test_8xdouble_perm_
 define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4192,7 +4192,7 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4251,7 +4251,7 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4272,7 +4272,7 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4345,7 +4345,7 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4366,7 +4366,7 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4425,7 +4425,7 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
 ; GENERIC-LABEL: test_8xdouble_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [6:0.50]
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -4441,7 +4441,7 @@ define <8 x double> @test_8xdouble_perm_
 define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4462,7 +4462,7 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [6:0.50]
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -5284,7 +5284,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask0:
@@ -5321,7 +5321,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask1:
@@ -5358,7 +5358,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask2:
@@ -5408,7 +5408,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask3:
@@ -5443,7 +5443,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
 ; GENERIC-LABEL: test_64xi8_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -5459,7 +5459,7 @@ define <64 x i8> @test_64xi8_perm_mem_ma
 define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -5480,7 +5480,7 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -5501,7 +5501,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -5522,7 +5522,7 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -5543,7 +5543,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -5564,7 +5564,7 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -5585,7 +5585,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
 ; GENERIC-LABEL: test_64xi8_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -5601,7 +5601,7 @@ define <64 x i8> @test_64xi8_perm_mem_ma
 define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -5622,7 +5622,7 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50]
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -7020,7 +7020,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask0:
@@ -7057,7 +7057,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask1:
@@ -7094,7 +7094,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask2:
@@ -7144,7 +7144,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask3:
@@ -7181,7 +7181,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask4:
@@ -7218,7 +7218,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask5:
@@ -7268,7 +7268,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask6:
@@ -7305,7 +7305,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask7:
@@ -7560,7 +7560,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5:
@@ -7581,7 +7581,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
@@ -8413,7 +8413,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask0:
@@ -8450,7 +8450,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask1:
@@ -8487,7 +8487,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask2:
@@ -8537,7 +8537,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask3:
@@ -9129,7 +9129,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask0:
@@ -9166,7 +9166,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask1:
@@ -9203,7 +9203,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask2:
@@ -9253,7 +9253,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask3:
@@ -9304,7 +9304,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0:
@@ -9344,7 +9344,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1:
@@ -9384,7 +9384,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2:
@@ -9438,7 +9438,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3:
@@ -9853,7 +9853,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask0:
@@ -9890,7 +9890,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask1:
@@ -9927,7 +9927,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask2:
@@ -9977,7 +9977,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask3:
@@ -10028,7 +10028,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0:
@@ -10068,7 +10068,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1:
@@ -10108,7 +10108,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2:
@@ -10162,7 +10162,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3:
@@ -10577,7 +10577,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask0:
@@ -10614,7 +10614,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask1:
@@ -10651,7 +10651,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask2:
@@ -10701,7 +10701,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask3:
@@ -10752,7 +10752,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0:
@@ -10792,7 +10792,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1:
@@ -10832,7 +10832,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2:
@@ -10886,7 +10886,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3:
@@ -11301,7 +11301,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask0:
@@ -11338,7 +11338,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask1:
@@ -11375,7 +11375,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask2:
@@ -11425,7 +11425,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask3:
@@ -11476,7 +11476,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0:
@@ -11516,7 +11516,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1:
@@ -11556,7 +11556,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2:
@@ -11610,7 +11610,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3:
@@ -12387,7 +12387,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0:
@@ -12424,7 +12424,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1:
@@ -12461,7 +12461,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2:
@@ -12511,7 +12511,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3:
@@ -12562,7 +12562,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
@@ -12602,7 +12602,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
@@ -12642,7 +12642,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
@@ -12696,7 +12696,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
@@ -13292,7 +13292,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0:
@@ -13329,7 +13329,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1:
@@ -13366,7 +13366,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2:
@@ -13416,7 +13416,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3:
@@ -13467,7 +13467,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
@@ -13507,7 +13507,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
@@ -13547,7 +13547,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
@@ -13601,7 +13601,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
@@ -14378,7 +14378,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0:
@@ -14415,7 +14415,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1:
@@ -14452,7 +14452,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2:
@@ -14502,7 +14502,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask3:
@@ -14553,7 +14553,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
@@ -14593,7 +14593,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
@@ -14633,7 +14633,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
@@ -14687,7 +14687,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
@@ -15283,7 +15283,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0:
@@ -15320,7 +15320,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1:
@@ -15357,7 +15357,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2:
@@ -15407,7 +15407,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3:
@@ -15458,7 +15458,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
@@ -15498,7 +15498,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
@@ -15538,7 +15538,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
@@ -15592,7 +15592,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:

Modified: llvm/trunk/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/extractelement-legalization-store-ordering.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/extractelement-legalization-store-ordering.ll (original)
+++ llvm/trunk/test/CodeGen/X86/extractelement-legalization-store-ordering.ll Wed Apr  4 22:19:36 2018
@@ -15,22 +15,23 @@ define void @test_extractelement_legaliz
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    paddd (%ecx), %xmm0
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    paddd (%edx), %xmm0
-; CHECK-NEXT:    movdqa %xmm0, (%edx)
-; CHECK-NEXT:    movl (%edx), %esi
-; CHECK-NEXT:    movl 4(%edx), %edi
-; CHECK-NEXT:    shll $4, %ecx
-; CHECK-NEXT:    movl 8(%edx), %ebx
-; CHECK-NEXT:    movl 12(%edx), %edx
-; CHECK-NEXT:    movl %esi, 12(%eax,%ecx)
-; CHECK-NEXT:    movl %edi, (%eax,%ecx)
-; CHECK-NEXT:    movl %ebx, 8(%eax,%ecx)
-; CHECK-NEXT:    movl %edx, 4(%eax,%ecx)
+; CHECK-NEXT:    movdqa %xmm0, (%ecx)
+; CHECK-NEXT:    movl (%ecx), %esi
+; CHECK-NEXT:    movl 4(%ecx), %edi
+; CHECK-NEXT:    shll $4, %edx
+; CHECK-NEXT:    movl 8(%ecx), %ebx
+; CHECK-NEXT:    movl 12(%ecx), %ecx
+; CHECK-NEXT:    movl %esi, 12(%eax,%edx)
+; CHECK-NEXT:    movl %edi, (%eax,%edx)
+; CHECK-NEXT:    movl %ebx, 8(%eax,%edx)
+; CHECK-NEXT:    movl %ecx, 4(%eax,%edx)
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    popl %ebx
 ; CHECK-NEXT:    retl
+; CHECK-NEXT:    ## -- End function
 entry:
   %0 = bitcast i32* %y to <4 x i32>*
   %1 = load <4 x i32>, <4 x i32>* %0, align 16

Modified: llvm/trunk/test/CodeGen/X86/fp128-i128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fp128-i128.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fp128-i128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fp128-i128.ll Wed Apr  4 22:19:36 2018
@@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64
 ; CHECK-NEXT:    andq %rdi, %rcx
 ; CHECK-NEXT:    movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
 ; CHECK-NEXT:    andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    jmp foo # TAILCALL
@@ -105,11 +105,11 @@ define fp128 @TestI128_1(fp128 %x) #0 {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq $40, %rsp
 ; CHECK-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT:    andq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rcx, (%rsp)
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    andq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rax, (%rsp)
 ; CHECK-NEXT:    movaps (%rsp), %xmm0
 ; CHECK-NEXT:    movaps {{.*}}(%rip), %xmm1
 ; CHECK-NEXT:    callq __lttf2
@@ -336,11 +336,11 @@ define void @TestCopySign({ fp128, fp128
 ; CHECK-NEXT:    movq %rdi, %rbx
 ; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT:    callq __gttf2
 ; CHECK-NEXT:    movl %eax, %ebp
-; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-NEXT:    callq __subtf3
 ; CHECK-NEXT:    testl %ebp, %ebp
@@ -355,8 +355,8 @@ define void @TestCopySign({ fp128, fp128
 ; CHECK-NEXT:    movaps (%rsp), %xmm2 # 16-byte Reload
 ; CHECK-NEXT:  .LBB10_3: # %cleanup
 ; CHECK-NEXT:    movaps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    andps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    andps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    movaps %xmm2, (%rbx)
 ; CHECK-NEXT:    movaps %xmm0, 16(%rbx)

Modified: llvm/trunk/test/CodeGen/X86/memcpy-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/memcpy-2.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/memcpy-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/memcpy-2.ll Wed Apr  4 22:19:36 2018
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mattr=+sse2      -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2-Darwin
 ; RUN: llc < %s -mattr=+sse2      -mtriple=i686-pc-mingw32 -mcpu=core2 | FileCheck %s -check-prefix=SSE2-Mingw32
 ; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1
@@ -11,62 +10,42 @@
 @.str2 = internal constant [30 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 4
 
 define void @t1(i32 %argc, i8** %argv) nounwind  {
-; SSE2-Darwin-LABEL: t1:
-; SSE2-Darwin:       ## %bb.0: ## %entry
-; SSE2-Darwin-NEXT:    subl $28, %esp
-; SSE2-Darwin-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-Darwin-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
-; SSE2-Darwin-NEXT:    movaps _.str, %xmm0
-; SSE2-Darwin-NEXT:    movaps %xmm0, (%esp)
-; SSE2-Darwin-NEXT:    movb $0, {{[0-9]+}}(%esp)
-;
-; SSE2-Mingw32-LABEL: t1:
-; SSE2-Mingw32:       # %bb.0: # %entry
-; SSE2-Mingw32-NEXT:    subl $28, %esp
-; SSE2-Mingw32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-Mingw32-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
-; SSE2-Mingw32-NEXT:    movaps _.str, %xmm0
-; SSE2-Mingw32-NEXT:    movups %xmm0, (%esp)
-; SSE2-Mingw32-NEXT:    movb $0, {{[0-9]+}}(%esp)
-;
-; SSE1-LABEL: t1:
-; SSE1:       ## %bb.0: ## %entry
-; SSE1-NEXT:    subl $28, %esp
-; SSE1-NEXT:    movaps _.str, %xmm0
-; SSE1-NEXT:    movaps %xmm0, (%esp)
-; SSE1-NEXT:    movb $0, {{[0-9]+}}(%esp)
-; SSE1-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; SSE1-NEXT:    movl $0, {{[0-9]+}}(%esp)
-;
-; NOSSE-LABEL: t1:
-; NOSSE:       ## %bb.0: ## %entry
-; NOSSE-NEXT:    subl $28, %esp
-; NOSSE-NEXT:    movb $0, {{[0-9]+}}(%esp)
-; NOSSE-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; NOSSE-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; NOSSE-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; NOSSE-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; NOSSE-NEXT:    movl $101, {{[0-9]+}}(%esp)
-; NOSSE-NEXT:    movl $1734438249, (%esp) ## imm = 0x67616D69
-;
-; X86-64-LABEL: t1:
-; X86-64:       ## %bb.0: ## %entry
-; X86-64-NEXT:    movaps {{.*}}(%rip), %xmm0
-; X86-64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X86-64-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
-; X86-64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-;
-; NHM_64-LABEL: t1:
-; NHM_64:       ## %bb.0: ## %entry
-; NHM_64-NEXT:    movups _.str+{{.*}}(%rip), %xmm0
-; NHM_64-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; NHM_64-NEXT:    movaps {{.*}}(%rip), %xmm0
-; NHM_64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 entry:
+; SSE2-Darwin-LABEL: t1:
+; SSE2-Darwin: movsd _.str+16, %xmm0
+; SSE2-Darwin: movsd %xmm0, 16(%esp)
+; SSE2-Darwin: movaps _.str, %xmm0
+; SSE2-Darwin: movaps %xmm0
+; SSE2-Darwin: movb $0, 24(%esp)
 
+; SSE2-Mingw32-LABEL: t1:
+; SSE2-Mingw32: movsd _.str+16, %xmm0
+; SSE2-Mingw32: movsd %xmm0, 16(%esp)
+; SSE2-Mingw32: movaps _.str, %xmm0
+; SSE2-Mingw32: movups %xmm0
+; SSE2-Mingw32: movb $0, 24(%esp)
 
+; SSE1-LABEL: t1:
+; SSE1: movaps _.str, %xmm0
+; SSE1: movb $0, 24(%esp)
+; SSE1: movaps %xmm0
+; SSE1: movl $0, 20(%esp)
+; SSE1: movl $0, 16(%esp)
 
+; NOSSE-LABEL: t1:
+; NOSSE: movb $0
+; NOSSE: movl $0
+; NOSSE: movl $0
+; NOSSE: movl $0
+; NOSSE: movl $0
+; NOSSE: movl $101
+; NOSSE: movl $1734438249
 
+; X86-64-LABEL: t1:
+; X86-64: movaps _.str(%rip), %xmm0
+; X86-64: movaps %xmm0
+; X86-64: movb $0
+; X86-64: movq $0
   %tmp1 = alloca [25 x i8]
   %tmp2 = bitcast [25 x i8]* %tmp1 to i8*
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp2, i8* align 1 getelementptr inbounds ([25 x i8], [25 x i8]* @.str, i32 0, i32 0), i32 25, i1 false)
@@ -77,60 +56,34 @@ entry:
 %struct.s0 = type { [2 x double] }
 
 define void @t2(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
-; SSE2-Darwin-LABEL: t2:
-; SSE2-Darwin:       ## %bb.0: ## %entry
-; SSE2-Darwin-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-Darwin-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE2-Darwin-NEXT:    movaps (%ecx), %xmm0
-; SSE2-Darwin-NEXT:    movaps %xmm0, (%eax)
-; SSE2-Darwin-NEXT:    retl
-;
-; SSE2-Mingw32-LABEL: t2:
-; SSE2-Mingw32:       # %bb.0: # %entry
-; SSE2-Mingw32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-Mingw32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE2-Mingw32-NEXT:    movaps (%ecx), %xmm0
-; SSE2-Mingw32-NEXT:    movaps %xmm0, (%eax)
-; SSE2-Mingw32-NEXT:    retl
-;
-; SSE1-LABEL: t2:
-; SSE1:       ## %bb.0: ## %entry
-; SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE1-NEXT:    movaps (%ecx), %xmm0
-; SSE1-NEXT:    movaps %xmm0, (%eax)
-; SSE1-NEXT:    retl
-;
-; NOSSE-LABEL: t2:
-; NOSSE:       ## %bb.0: ## %entry
-; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; NOSSE-NEXT:    movl 12(%ecx), %edx
-; NOSSE-NEXT:    movl %edx, 12(%eax)
-; NOSSE-NEXT:    movl 8(%ecx), %edx
-; NOSSE-NEXT:    movl %edx, 8(%eax)
-; NOSSE-NEXT:    movl (%ecx), %edx
-; NOSSE-NEXT:    movl 4(%ecx), %ecx
-; NOSSE-NEXT:    movl %ecx, 4(%eax)
-; NOSSE-NEXT:    movl %edx, (%eax)
-; NOSSE-NEXT:    retl
-;
-; X86-64-LABEL: t2:
-; X86-64:       ## %bb.0: ## %entry
-; X86-64-NEXT:    movaps (%rsi), %xmm0
-; X86-64-NEXT:    movaps %xmm0, (%rdi)
-; X86-64-NEXT:    retq
-;
-; NHM_64-LABEL: t2:
-; NHM_64:       ## %bb.0: ## %entry
-; NHM_64-NEXT:    movaps (%rsi), %xmm0
-; NHM_64-NEXT:    movaps %xmm0, (%rdi)
-; NHM_64-NEXT:    retq
 entry:
+; SSE2-Darwin-LABEL: t2:
+; SSE2-Darwin: movaps (%ecx), %xmm0
+; SSE2-Darwin: movaps %xmm0, (%eax)
 
+; SSE2-Mingw32-LABEL: t2:
+; SSE2-Mingw32: movaps (%ecx), %xmm0
+; SSE2-Mingw32: movaps %xmm0, (%eax)
 
+; SSE1-LABEL: t2:
+; SSE1: movaps (%ecx), %xmm0
+; SSE1: movaps %xmm0, (%eax)
 
+; NOSSE-LABEL: t2:
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
 
+; X86-64-LABEL: t2:
+; X86-64: movaps (%rsi), %xmm0
+; X86-64: movaps %xmm0, (%rdi)
   %tmp2 = bitcast %struct.s0* %a to i8*           ; <i8*> [#uses=1]
   %tmp3 = bitcast %struct.s0* %b to i8*           ; <i8*> [#uses=1]
   tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %tmp2, i8* align 16 %tmp3, i32 16, i1 false)
@@ -138,72 +91,48 @@ entry:
 }
 
 define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
-; SSE2-Darwin-LABEL: t3:
-; SSE2-Darwin:       ## %bb.0: ## %entry
-; SSE2-Darwin-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-Darwin-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE2-Darwin-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-Darwin-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE2-Darwin-NEXT:    movsd %xmm1, 8(%eax)
-; SSE2-Darwin-NEXT:    movsd %xmm0, (%eax)
-; SSE2-Darwin-NEXT:    retl
-;
-; SSE2-Mingw32-LABEL: t3:
-; SSE2-Mingw32:       # %bb.0: # %entry
-; SSE2-Mingw32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-Mingw32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE2-Mingw32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-Mingw32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE2-Mingw32-NEXT:    movsd %xmm1, 8(%eax)
-; SSE2-Mingw32-NEXT:    movsd %xmm0, (%eax)
-; SSE2-Mingw32-NEXT:    retl
-;
-; SSE1-LABEL: t3:
-; SSE1:       ## %bb.0: ## %entry
-; SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE1-NEXT:    movl 12(%ecx), %edx
-; SSE1-NEXT:    movl %edx, 12(%eax)
-; SSE1-NEXT:    movl 8(%ecx), %edx
-; SSE1-NEXT:    movl %edx, 8(%eax)
-; SSE1-NEXT:    movl (%ecx), %edx
-; SSE1-NEXT:    movl 4(%ecx), %ecx
-; SSE1-NEXT:    movl %ecx, 4(%eax)
-; SSE1-NEXT:    movl %edx, (%eax)
-; SSE1-NEXT:    retl
-;
-; NOSSE-LABEL: t3:
-; NOSSE:       ## %bb.0: ## %entry
-; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; NOSSE-NEXT:    movl 12(%ecx), %edx
-; NOSSE-NEXT:    movl %edx, 12(%eax)
-; NOSSE-NEXT:    movl 8(%ecx), %edx
-; NOSSE-NEXT:    movl %edx, 8(%eax)
-; NOSSE-NEXT:    movl (%ecx), %edx
-; NOSSE-NEXT:    movl 4(%ecx), %ecx
-; NOSSE-NEXT:    movl %ecx, 4(%eax)
-; NOSSE-NEXT:    movl %edx, (%eax)
-; NOSSE-NEXT:    retl
-;
-; X86-64-LABEL: t3:
-; X86-64:       ## %bb.0: ## %entry
-; X86-64-NEXT:    movq (%rsi), %rax
-; X86-64-NEXT:    movq 8(%rsi), %rcx
-; X86-64-NEXT:    movq %rcx, 8(%rdi)
-; X86-64-NEXT:    movq %rax, (%rdi)
-; X86-64-NEXT:    retq
-;
-; NHM_64-LABEL: t3:
-; NHM_64:       ## %bb.0: ## %entry
-; NHM_64-NEXT:    movups (%rsi), %xmm0
-; NHM_64-NEXT:    movups %xmm0, (%rdi)
-; NHM_64-NEXT:    retq
 entry:
+; SSE2-Darwin-LABEL: t3:
+; SSE2-Darwin: movsd (%ecx), %xmm0
+; SSE2-Darwin: movsd 8(%ecx), %xmm1
+; SSE2-Darwin: movsd %xmm1, 8(%eax)
+; SSE2-Darwin: movsd %xmm0, (%eax)
 
+; SSE2-Mingw32-LABEL: t3:
+; SSE2-Mingw32: movsd (%ecx), %xmm0
+; SSE2-Mingw32: movsd 8(%ecx), %xmm1
+; SSE2-Mingw32: movsd %xmm1, 8(%eax)
+; SSE2-Mingw32: movsd %xmm0, (%eax)
 
+; SSE1-LABEL: t3:
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
 
+; NOSSE-LABEL: t3:
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
 
+; X86-64-LABEL: t3:
+; X86-64: movq (%rsi), %rax
+; X86-64: movq 8(%rsi), %rcx
+; X86-64: movq %rcx, 8(%rdi)
+; X86-64: movq %rax, (%rdi)
   %tmp2 = bitcast %struct.s0* %a to i8*           ; <i8*> [#uses=1]
   %tmp3 = bitcast %struct.s0* %b to i8*           ; <i8*> [#uses=1]
   tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %tmp2, i8* align 8 %tmp3, i32 16, i1 false)
@@ -211,80 +140,65 @@ entry:
 }
 
 define void @t4() nounwind {
-; SSE2-Darwin-LABEL: t4:
-; SSE2-Darwin:       ## %bb.0: ## %entry
-; SSE2-Darwin-NEXT:    subl $32, %esp
-; SSE2-Darwin-NEXT:    movw $120, {{[0-9]+}}(%esp)
-; SSE2-Darwin-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE2-Darwin-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE2-Darwin-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE2-Darwin-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE2-Darwin-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE2-Darwin-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE2-Darwin-NEXT:    movl $2021161080, (%esp) ## imm = 0x78787878
-;
-; SSE2-Mingw32-LABEL: t4:
-; SSE2-Mingw32:       # %bb.0: # %entry
-; SSE2-Mingw32-NEXT:    subl $32, %esp
-; SSE2-Mingw32-NEXT:    movw $120, {{[0-9]+}}(%esp)
-; SSE2-Mingw32-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) # imm = 0x78787878
-; SSE2-Mingw32-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) # imm = 0x78787878
-; SSE2-Mingw32-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) # imm = 0x78787878
-; SSE2-Mingw32-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) # imm = 0x78787878
-; SSE2-Mingw32-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) # imm = 0x78787878
-; SSE2-Mingw32-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) # imm = 0x78787878
-; SSE2-Mingw32-NEXT:    movl $2021161080, (%esp) # imm = 0x78787878
-;
-; SSE1-LABEL: t4:
-; SSE1:       ## %bb.0: ## %entry
-; SSE1-NEXT:    subl $32, %esp
-; SSE1-NEXT:    movw $120, {{[0-9]+}}(%esp)
-; SSE1-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE1-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE1-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE1-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE1-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE1-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; SSE1-NEXT:    movl $2021161080, (%esp) ## imm = 0x78787878
-;
-; NOSSE-LABEL: t4:
-; NOSSE:       ## %bb.0: ## %entry
-; NOSSE-NEXT:    subl $32, %esp
-; NOSSE-NEXT:    movw $120, {{[0-9]+}}(%esp)
-; NOSSE-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; NOSSE-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; NOSSE-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; NOSSE-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; NOSSE-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; NOSSE-NEXT:    movl $2021161080, {{[0-9]+}}(%esp) ## imm = 0x78787878
-; NOSSE-NEXT:    movl $2021161080, (%esp) ## imm = 0x78787878
-;
-; X86-64-LABEL: t4:
-; X86-64:       ## %bb.0: ## %entry
-; X86-64-NEXT:    movabsq $33909456017848440, %rax ## imm = 0x78787878787878
-; X86-64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X86-64-NEXT:    movabsq $8680820740569200760, %rax ## imm = 0x7878787878787878
-; X86-64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X86-64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X86-64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-;
-; NHM_64-LABEL: t4:
-; NHM_64:       ## %bb.0: ## %entry
-; NHM_64-NEXT:    movups _.str2+{{.*}}(%rip), %xmm0
-; NHM_64-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; NHM_64-NEXT:    movups {{.*}}(%rip), %xmm0
-; NHM_64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 entry:
+; SSE2-Darwin-LABEL: t4:
+; SSE2-Darwin: movw $120
+; SSE2-Darwin: movl $2021161080
+; SSE2-Darwin: movl $2021161080
+; SSE2-Darwin: movl $2021161080
+; SSE2-Darwin: movl $2021161080
+; SSE2-Darwin: movl $2021161080
+; SSE2-Darwin: movl $2021161080
+; SSE2-Darwin: movl $2021161080
 
+; SSE2-Mingw32-LABEL: t4:
+; SSE2-Mingw32: movw $120
+; SSE2-Mingw32: movl $2021161080
+; SSE2-Mingw32: movl $2021161080
+; SSE2-Mingw32: movl $2021161080
+; SSE2-Mingw32: movl $2021161080
+; SSE2-Mingw32: movl $2021161080
+; SSE2-Mingw32: movl $2021161080
+; SSE2-Mingw32: movl $2021161080
 
+; SSE1-LABEL: t4:
+; SSE1: movw $120
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
+; SSE1: movl $2021161080
 
+; NOSSE-LABEL: t4:
+; NOSSE: movw $120
+; NOSSE: movl $2021161080
+; NOSSE: movl $2021161080
+; NOSSE: movl $2021161080
+; NOSSE: movl $2021161080
+; NOSSE: movl $2021161080
+; NOSSE: movl $2021161080
+; NOSSE: movl $2021161080
 
 ;;; TODO: (1) Some of the loads and stores are certainly unaligned and (2) the first load and first
 ;;; store overlap with the second load and second store respectively.
 ;;;
 ;;; Is either of the sequences ideal?
 
+; X86-64-LABEL: t4:
+; X86-64: movabsq  $33909456017848440, %rax ## imm = 0x78787878787878
+; X86-64: movq     %rax, -10(%rsp)
+; X86-64: movabsq  $8680820740569200760, %rax ## imm = 0x7878787878787878
+; X86-64: movq     %rax, -16(%rsp)
+; X86-64: movq     %rax, -24(%rsp)
+; X86-64: movq     %rax, -32(%rsp)
 
+; NHM_64-LABEL: t4:
+; NHM_64: movups   _.str2+14(%rip), %xmm0
+; NHM_64: movups   %xmm0, -26(%rsp)
+; NHM_64: movups   _.str2(%rip), %xmm0
+; NHM_64: movaps   %xmm0, -40(%rsp)
 
   %tmp1 = alloca [30 x i8]
   %tmp2 = bitcast [30 x i8]* %tmp1 to i8*

Modified: llvm/trunk/test/CodeGen/X86/misched-matrix.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/misched-matrix.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/misched-matrix.ll (original)
+++ llvm/trunk/test/CodeGen/X86/misched-matrix.ll Wed Apr  4 22:19:36 2018
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -pre-RA-sched=source -enable-misched \
 ; RUN:          -misched-topdown -verify-machineinstrs \
 ; RUN:     | FileCheck %s -check-prefix=TOPDOWN
@@ -16,232 +15,78 @@
 ; been reordered with the stores. This tests the scheduler's cheap
 ; alias analysis ability (that doesn't require any AliasAnalysis pass).
 ;
+; TOPDOWN-LABEL: %for.body
+; TOPDOWN: movl %{{.*}}, (
+; TOPDOWN-NOT: imull {{[0-9]*}}(
+; TOPDOWN: movl %{{.*}}, 4(
+; TOPDOWN-NOT: imull {{[0-9]*}}(
+; TOPDOWN: movl %{{.*}}, 8(
+; TOPDOWN: movl %{{.*}}, 12(
+; TOPDOWN-LABEL: %for.end
+;
 ; For -misched=ilpmin, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are interleaved.
 ;
+; ILPMIN-LABEL: %for.body
+; ILPMIN: movl %{{.*}}, (
+; ILPMIN: imull
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: movl %{{.*}}, 4(
+; ILPMIN: imull
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: movl %{{.*}}, 8(
+; ILPMIN: imull
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: movl %{{.*}}, 12(
+; ILPMIN-LABEL: %for.end
+;
 ; For -misched=ilpmax, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are clustered.
 ;
+; ILPMAX-LABEL: %for.body
+; ILPMAX: movl %{{.*}}, (
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: movl %{{.*}}, 4(
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: movl %{{.*}}, 8(
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: movl %{{.*}}, 12(
+; ILPMAX-LABEL: %for.end
+
 define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
-; TOPDOWN-LABEL: mmult:
-; TOPDOWN:       # %bb.0: # %entry
-; TOPDOWN-NEXT:    pushq %rbp
-; TOPDOWN-NEXT:    .cfi_def_cfa_offset 16
-; TOPDOWN-NEXT:    pushq %r14
-; TOPDOWN-NEXT:    .cfi_def_cfa_offset 24
-; TOPDOWN-NEXT:    pushq %rbx
-; TOPDOWN-NEXT:    .cfi_def_cfa_offset 32
-; TOPDOWN-NEXT:    .cfi_offset %rbx, -32
-; TOPDOWN-NEXT:    .cfi_offset %r14, -24
-; TOPDOWN-NEXT:    .cfi_offset %rbp, -16
-; TOPDOWN-NEXT:    xorl %eax, %eax
-; TOPDOWN-NEXT:    .p2align 4, 0x90
-; TOPDOWN-NEXT:  .LBB0_1: # %for.body
-; TOPDOWN-NEXT:    # =>This Inner Loop Header: Depth=1
-; TOPDOWN-NEXT:    movl (%rdi,%rax), %r10d
-; TOPDOWN-NEXT:    movl 4(%rdi,%rax), %r14d
-; TOPDOWN-NEXT:    movl 8(%rdi,%rax), %r9d
-; TOPDOWN-NEXT:    movl 12(%rdi,%rax), %r8d
-; TOPDOWN-NEXT:    movl (%rsi), %ecx
-; TOPDOWN-NEXT:    imull %r10d, %ecx
-; TOPDOWN-NEXT:    movl 16(%rsi), %ebx
-; TOPDOWN-NEXT:    imull %r14d, %ebx
-; TOPDOWN-NEXT:    addl %ecx, %ebx
-; TOPDOWN-NEXT:    movl 32(%rsi), %ecx
-; TOPDOWN-NEXT:    imull %r9d, %ecx
-; TOPDOWN-NEXT:    addl %ebx, %ecx
-; TOPDOWN-NEXT:    movl 48(%rsi), %r11d
-; TOPDOWN-NEXT:    imull %r8d, %r11d
-; TOPDOWN-NEXT:    addl %ecx, %r11d
-; TOPDOWN-NEXT:    movl 4(%rsi), %ecx
-; TOPDOWN-NEXT:    imull %r10d, %ecx
-; TOPDOWN-NEXT:    movl 20(%rsi), %ebx
-; TOPDOWN-NEXT:    imull %r14d, %ebx
-; TOPDOWN-NEXT:    addl %ecx, %ebx
-; TOPDOWN-NEXT:    movl 36(%rsi), %ecx
-; TOPDOWN-NEXT:    imull %r9d, %ecx
-; TOPDOWN-NEXT:    addl %ebx, %ecx
-; TOPDOWN-NEXT:    movl 52(%rsi), %ebx
-; TOPDOWN-NEXT:    imull %r8d, %ebx
-; TOPDOWN-NEXT:    addl %ecx, %ebx
-; TOPDOWN-NEXT:    movl 8(%rsi), %ecx
-; TOPDOWN-NEXT:    imull %r10d, %ecx
-; TOPDOWN-NEXT:    movl 24(%rsi), %ebp
-; TOPDOWN-NEXT:    imull %r14d, %ebp
-; TOPDOWN-NEXT:    addl %ecx, %ebp
-; TOPDOWN-NEXT:    movl 40(%rsi), %ecx
-; TOPDOWN-NEXT:    imull %r9d, %ecx
-; TOPDOWN-NEXT:    addl %ebp, %ecx
-; TOPDOWN-NEXT:    movl 56(%rsi), %ebp
-; TOPDOWN-NEXT:    imull %r8d, %ebp
-; TOPDOWN-NEXT:    addl %ecx, %ebp
-; TOPDOWN-NEXT:    imull 12(%rsi), %r10d
-; TOPDOWN-NEXT:    movl %r11d, (%rdx,%rax)
-; TOPDOWN-NEXT:    imull 28(%rsi), %r14d
-; TOPDOWN-NEXT:    addl %r10d, %r14d
-; TOPDOWN-NEXT:    movl %ebx, 4(%rdx,%rax)
-; TOPDOWN-NEXT:    imull 44(%rsi), %r9d
-; TOPDOWN-NEXT:    addl %r14d, %r9d
-; TOPDOWN-NEXT:    movl %ebp, 8(%rdx,%rax)
-; TOPDOWN-NEXT:    imull 60(%rsi), %r8d
-; TOPDOWN-NEXT:    addl %r9d, %r8d
-; TOPDOWN-NEXT:    movl %r8d, 12(%rdx,%rax)
-; TOPDOWN-NEXT:    addq $16, %rax
-; TOPDOWN-NEXT:    cmpl $64, %eax
-; TOPDOWN-NEXT:    jne .LBB0_1
-; TOPDOWN-NEXT:  # %bb.2: # %for.end
-; TOPDOWN-NEXT:    popq %rbx
-; TOPDOWN-NEXT:    popq %r14
-; TOPDOWN-NEXT:    popq %rbp
-; TOPDOWN-NEXT:    retq
-;
-; ILPMIN-LABEL: mmult:
-; ILPMIN:       # %bb.0: # %entry
-; ILPMIN-NEXT:    pushq %rbp
-; ILPMIN-NEXT:    .cfi_def_cfa_offset 16
-; ILPMIN-NEXT:    pushq %r14
-; ILPMIN-NEXT:    .cfi_def_cfa_offset 24
-; ILPMIN-NEXT:    pushq %rbx
-; ILPMIN-NEXT:    .cfi_def_cfa_offset 32
-; ILPMIN-NEXT:    .cfi_offset %rbx, -32
-; ILPMIN-NEXT:    .cfi_offset %r14, -24
-; ILPMIN-NEXT:    .cfi_offset %rbp, -16
-; ILPMIN-NEXT:    xorl %r14d, %r14d
-; ILPMIN-NEXT:    .p2align 4, 0x90
-; ILPMIN-NEXT:  .LBB0_1: # %for.body
-; ILPMIN-NEXT:    # =>This Inner Loop Header: Depth=1
-; ILPMIN-NEXT:    movl (%rdi,%r14), %r8d
-; ILPMIN-NEXT:    movl 8(%rdi,%r14), %r9d
-; ILPMIN-NEXT:    movl 4(%rdi,%r14), %r11d
-; ILPMIN-NEXT:    movl 12(%rdi,%r14), %r10d
-; ILPMIN-NEXT:    movl (%rsi), %ecx
-; ILPMIN-NEXT:    movl 16(%rsi), %ebx
-; ILPMIN-NEXT:    movl 32(%rsi), %ebp
-; ILPMIN-NEXT:    imull %r8d, %ecx
-; ILPMIN-NEXT:    imull %r11d, %ebx
-; ILPMIN-NEXT:    addl %ecx, %ebx
-; ILPMIN-NEXT:    imull %r9d, %ebp
-; ILPMIN-NEXT:    addl %ebx, %ebp
-; ILPMIN-NEXT:    movl 48(%rsi), %ecx
-; ILPMIN-NEXT:    imull %r10d, %ecx
-; ILPMIN-NEXT:    addl %ebp, %ecx
-; ILPMIN-NEXT:    movl %ecx, (%rdx,%r14)
-; ILPMIN-NEXT:    movl 52(%rsi), %ecx
-; ILPMIN-NEXT:    movl 4(%rsi), %ebx
-; ILPMIN-NEXT:    movl 20(%rsi), %ebp
-; ILPMIN-NEXT:    movl 36(%rsi), %eax
-; ILPMIN-NEXT:    imull %r8d, %ebx
-; ILPMIN-NEXT:    imull %r11d, %ebp
-; ILPMIN-NEXT:    addl %ebx, %ebp
-; ILPMIN-NEXT:    imull %r9d, %eax
-; ILPMIN-NEXT:    addl %ebp, %eax
-; ILPMIN-NEXT:    imull %r10d, %ecx
-; ILPMIN-NEXT:    addl %eax, %ecx
-; ILPMIN-NEXT:    movl %ecx, 4(%rdx,%r14)
-; ILPMIN-NEXT:    movl 56(%rsi), %eax
-; ILPMIN-NEXT:    movl 8(%rsi), %ecx
-; ILPMIN-NEXT:    movl 24(%rsi), %ebx
-; ILPMIN-NEXT:    movl 40(%rsi), %ebp
-; ILPMIN-NEXT:    imull %r8d, %ecx
-; ILPMIN-NEXT:    imull %r11d, %ebx
-; ILPMIN-NEXT:    addl %ecx, %ebx
-; ILPMIN-NEXT:    imull %r9d, %ebp
-; ILPMIN-NEXT:    addl %ebx, %ebp
-; ILPMIN-NEXT:    imull %r10d, %eax
-; ILPMIN-NEXT:    addl %ebp, %eax
-; ILPMIN-NEXT:    movl %eax, 8(%rdx,%r14)
-; ILPMIN-NEXT:    imull 12(%rsi), %r8d
-; ILPMIN-NEXT:    imull 28(%rsi), %r11d
-; ILPMIN-NEXT:    addl %r8d, %r11d
-; ILPMIN-NEXT:    imull 44(%rsi), %r9d
-; ILPMIN-NEXT:    addl %r11d, %r9d
-; ILPMIN-NEXT:    imull 60(%rsi), %r10d
-; ILPMIN-NEXT:    addl %r9d, %r10d
-; ILPMIN-NEXT:    movl %r10d, 12(%rdx,%r14)
-; ILPMIN-NEXT:    addq $16, %r14
-; ILPMIN-NEXT:    cmpl $64, %r14d
-; ILPMIN-NEXT:    jne .LBB0_1
-; ILPMIN-NEXT:  # %bb.2: # %for.end
-; ILPMIN-NEXT:    popq %rbx
-; ILPMIN-NEXT:    popq %r14
-; ILPMIN-NEXT:    popq %rbp
-; ILPMIN-NEXT:    retq
-;
-; ILPMAX-LABEL: mmult:
-; ILPMAX:       # %bb.0: # %entry
-; ILPMAX-NEXT:    pushq %rbp
-; ILPMAX-NEXT:    .cfi_def_cfa_offset 16
-; ILPMAX-NEXT:    pushq %r15
-; ILPMAX-NEXT:    .cfi_def_cfa_offset 24
-; ILPMAX-NEXT:    pushq %r14
-; ILPMAX-NEXT:    .cfi_def_cfa_offset 32
-; ILPMAX-NEXT:    pushq %rbx
-; ILPMAX-NEXT:    .cfi_def_cfa_offset 40
-; ILPMAX-NEXT:    .cfi_offset %rbx, -40
-; ILPMAX-NEXT:    .cfi_offset %r14, -32
-; ILPMAX-NEXT:    .cfi_offset %r15, -24
-; ILPMAX-NEXT:    .cfi_offset %rbp, -16
-; ILPMAX-NEXT:    xorl %r15d, %r15d
-; ILPMAX-NEXT:    .p2align 4, 0x90
-; ILPMAX-NEXT:  .LBB0_1: # %for.body
-; ILPMAX-NEXT:    # =>This Inner Loop Header: Depth=1
-; ILPMAX-NEXT:    movl (%rdi,%r15), %r8d
-; ILPMAX-NEXT:    movl 8(%rdi,%r15), %r9d
-; ILPMAX-NEXT:    movl 4(%rdi,%r15), %r14d
-; ILPMAX-NEXT:    movl 12(%rdi,%r15), %r10d
-; ILPMAX-NEXT:    movl 16(%rsi), %ebx
-; ILPMAX-NEXT:    imull %r14d, %ebx
-; ILPMAX-NEXT:    movl (%rsi), %ebp
-; ILPMAX-NEXT:    imull %r8d, %ebp
-; ILPMAX-NEXT:    movl 32(%rsi), %ecx
-; ILPMAX-NEXT:    imull %r9d, %ecx
-; ILPMAX-NEXT:    movl 48(%rsi), %r11d
-; ILPMAX-NEXT:    imull %r10d, %r11d
-; ILPMAX-NEXT:    addl %ebp, %ebx
-; ILPMAX-NEXT:    addl %ebx, %ecx
-; ILPMAX-NEXT:    addl %ecx, %r11d
-; ILPMAX-NEXT:    movl %r11d, (%rdx,%r15)
-; ILPMAX-NEXT:    movl 52(%rsi), %ecx
-; ILPMAX-NEXT:    imull %r10d, %ecx
-; ILPMAX-NEXT:    movl 4(%rsi), %ebp
-; ILPMAX-NEXT:    imull %r8d, %ebp
-; ILPMAX-NEXT:    movl 20(%rsi), %ebx
-; ILPMAX-NEXT:    imull %r14d, %ebx
-; ILPMAX-NEXT:    movl 36(%rsi), %eax
-; ILPMAX-NEXT:    imull %r9d, %eax
-; ILPMAX-NEXT:    addl %ebp, %ebx
-; ILPMAX-NEXT:    addl %ebx, %eax
-; ILPMAX-NEXT:    addl %eax, %ecx
-; ILPMAX-NEXT:    movl %ecx, 4(%rdx,%r15)
-; ILPMAX-NEXT:    movl 56(%rsi), %eax
-; ILPMAX-NEXT:    imull %r10d, %eax
-; ILPMAX-NEXT:    movl 8(%rsi), %ecx
-; ILPMAX-NEXT:    imull %r8d, %ecx
-; ILPMAX-NEXT:    movl 24(%rsi), %ebp
-; ILPMAX-NEXT:    imull %r14d, %ebp
-; ILPMAX-NEXT:    movl 40(%rsi), %ebx
-; ILPMAX-NEXT:    imull %r9d, %ebx
-; ILPMAX-NEXT:    addl %ecx, %ebp
-; ILPMAX-NEXT:    addl %ebp, %ebx
-; ILPMAX-NEXT:    addl %ebx, %eax
-; ILPMAX-NEXT:    movl %eax, 8(%rdx,%r15)
-; ILPMAX-NEXT:    imull 60(%rsi), %r10d
-; ILPMAX-NEXT:    imull 12(%rsi), %r8d
-; ILPMAX-NEXT:    imull 28(%rsi), %r14d
-; ILPMAX-NEXT:    imull 44(%rsi), %r9d
-; ILPMAX-NEXT:    addl %r8d, %r14d
-; ILPMAX-NEXT:    addl %r14d, %r9d
-; ILPMAX-NEXT:    addl %r9d, %r10d
-; ILPMAX-NEXT:    movl %r10d, 12(%rdx,%r15)
-; ILPMAX-NEXT:    addq $16, %r15
-; ILPMAX-NEXT:    cmpl $64, %r15d
-; ILPMAX-NEXT:    jne .LBB0_1
-; ILPMAX-NEXT:  # %bb.2: # %for.end
-; ILPMAX-NEXT:    popq %rbx
-; ILPMAX-NEXT:    popq %r14
-; ILPMAX-NEXT:    popq %r15
-; ILPMAX-NEXT:    popq %rbp
-; ILPMAX-NEXT:    retq
 [4 x i32]* noalias nocapture %m3) nounwind uwtable ssp {
 entry:
   br label %for.body

Modified: llvm/trunk/test/CodeGen/X86/mmx-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mmx-schedule.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mmx-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mmx-schedule.ll Wed Apr  4 22:19:36 2018
@@ -642,7 +642,7 @@ define i32 @test_movd(x86_mmx %a0, i32 %
 ; GENERIC-NEXT:    paddd %mm2, %mm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    movd %mm2, %ecx # sched: [1:0.33]
 ; GENERIC-NEXT:    movd %mm0, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    movl %ecx, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movl %ecx, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movd:
@@ -675,7 +675,7 @@ define i32 @test_movd(x86_mmx %a0, i32 %
 ; SANDY-NEXT:    paddd %mm2, %mm0 # sched: [3:1.00]
 ; SANDY-NEXT:    movd %mm2, %ecx # sched: [1:0.33]
 ; SANDY-NEXT:    movd %mm0, %eax # sched: [1:0.33]
-; SANDY-NEXT:    movl %ecx, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    movl %ecx, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movd:

Modified: llvm/trunk/test/CodeGen/X86/pr31045.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr31045.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr31045.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr31045.ll Wed Apr  4 22:19:36 2018
@@ -21,25 +21,26 @@ define void @_Z1av() local_unnamed_addr
 ; CHECK-NEXT:    movl struct_obj_3+{{.*}}(%rip), %eax
 ; CHECK-NEXT:    movsbl {{.*}}(%rip), %ecx
 ; CHECK-NEXT:    movzbl {{.*}}(%rip), %edx
+; CHECK-NEXT:    movzbl {{.*}}(%rip), %esi
 ; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    leal (%rax,%rax), %esi
-; CHECK-NEXT:    subl %ecx, %esi
-; CHECK-NEXT:    subl %edx, %esi
+; CHECK-NEXT:    leal (%rax,%rax), %edi
+; CHECK-NEXT:    subl %ecx, %edi
+; CHECK-NEXT:    subl %edx, %edi
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    notl %ecx
+; CHECK-NEXT:    movzbl %cl, %ecx
+; CHECK-NEXT:    movw %cx, struct_obj_12+{{.*}}(%rip)
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    cmovel %eax, %ecx
-; CHECK-NEXT:    movzbl {{.*}}(%rip), %edx
 ; CHECK-NEXT:    andl struct_obj_8+{{.*}}(%rip), %ecx
 ; CHECK-NEXT:    andl $1, %ecx
 ; CHECK-NEXT:    negl %ecx
-; CHECK-NEXT:    andl %edx, %ecx
+; CHECK-NEXT:    andl %esi, %ecx
 ; CHECK-NEXT:    negl %ecx
 ; CHECK-NEXT:    andl %eax, %ecx
 ; CHECK-NEXT:    negl %ecx
-; CHECK-NEXT:    testl %ecx, %esi
-; CHECK-NEXT:    notl %esi
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    movw %ax, struct_obj_12+{{.*}}(%rip)
+; CHECK-NEXT:    testl %ecx, %edi
 ; CHECK-NEXT:    setne {{.*}}(%rip)
 ; CHECK-NEXT:    retq
 entry:

Modified: llvm/trunk/test/CodeGen/X86/pr34080.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr34080.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr34080.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr34080.ll Wed Apr  4 22:19:36 2018
@@ -54,10 +54,10 @@ define void @_Z1fe(x86_fp80 %z) local_un
 ; SSE2-SCHEDULE-NEXT:    movq %rsp, %rbp
 ; SSE2-SCHEDULE-NEXT:    .cfi_def_cfa_register %rbp
 ; SSE2-SCHEDULE-NEXT:    fnstcw -4(%rbp)
+; SSE2-SCHEDULE-NEXT:    fldt 16(%rbp)
 ; SSE2-SCHEDULE-NEXT:    movzwl -4(%rbp), %eax
 ; SSE2-SCHEDULE-NEXT:    movw $3199, -4(%rbp) ## imm = 0xC7F
 ; SSE2-SCHEDULE-NEXT:    fldcw -4(%rbp)
-; SSE2-SCHEDULE-NEXT:    fldt 16(%rbp)
 ; SSE2-SCHEDULE-NEXT:    movw %ax, -4(%rbp)
 ; SSE2-SCHEDULE-NEXT:    fistl -8(%rbp)
 ; SSE2-SCHEDULE-NEXT:    fldcw -4(%rbp)
@@ -65,12 +65,12 @@ define void @_Z1fe(x86_fp80 %z) local_un
 ; SSE2-SCHEDULE-NEXT:    movsd %xmm0, -64(%rbp)
 ; SSE2-SCHEDULE-NEXT:    movsd %xmm0, -32(%rbp)
 ; SSE2-SCHEDULE-NEXT:    fsubl -32(%rbp)
-; SSE2-SCHEDULE-NEXT:    fnstcw -2(%rbp)
 ; SSE2-SCHEDULE-NEXT:    flds {{.*}}(%rip)
+; SSE2-SCHEDULE-NEXT:    fnstcw -2(%rbp)
+; SSE2-SCHEDULE-NEXT:    fmul %st(0), %st(1)
 ; SSE2-SCHEDULE-NEXT:    movzwl -2(%rbp), %eax
 ; SSE2-SCHEDULE-NEXT:    movw $3199, -2(%rbp) ## imm = 0xC7F
 ; SSE2-SCHEDULE-NEXT:    fldcw -2(%rbp)
-; SSE2-SCHEDULE-NEXT:    fmul %st(0), %st(1)
 ; SSE2-SCHEDULE-NEXT:    movw %ax, -2(%rbp)
 ; SSE2-SCHEDULE-NEXT:    fxch %st(1)
 ; SSE2-SCHEDULE-NEXT:    fistl -12(%rbp)

Modified: llvm/trunk/test/CodeGen/X86/schedule-x86-64-shld.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/schedule-x86-64-shld.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/schedule-x86-64-shld.ll (original)
+++ llvm/trunk/test/CodeGen/X86/schedule-x86-64-shld.ll Wed Apr  4 22:19:36 2018
@@ -403,7 +403,7 @@ define void @lshift_mem_b(i64 %b) nounwi
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
 ; GENERIC-NEXT:    shrdq $54, %rdi, %rax # sched: [2:0.67]
-; GENERIC-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
+; GENERIC-NEXT:    movq %rax, {{.*}}(%rip) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem_b:
@@ -437,7 +437,7 @@ define void @lshift_mem_b_optsize(i64 %b
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
 ; GENERIC-NEXT:    shrdq $54, %rdi, %rax # sched: [2:0.67]
-; GENERIC-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
+; GENERIC-NEXT:    movq %rax, {{.*}}(%rip) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem_b_optsize:

Modified: llvm/trunk/test/CodeGen/X86/schedule-x86_64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/schedule-x86_64.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/schedule-x86_64.ll (original)
+++ llvm/trunk/test/CodeGen/X86/schedule-x86_64.ll Wed Apr  4 22:19:36 2018
@@ -7669,8 +7669,8 @@ define void @test_movnti(i32 %a0, i32 *%
 ; GENERIC-LABEL: test_movnti:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    movntil %edi, (%rsi) # sched: [1:1.00]
-; GENERIC-NEXT:    movntiq %rdx, (%rcx) # sched: [1:1.00]
+; GENERIC-NEXT:    movntil %edi, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    movntiq %rdx, (%rcx) # sched: [5:1.00]
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -7693,8 +7693,8 @@ define void @test_movnti(i32 %a0, i32 *%
 ; SANDY-LABEL: test_movnti:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    #APP
-; SANDY-NEXT:    movntil %edi, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    movntiq %rdx, (%rcx) # sched: [1:1.00]
+; SANDY-NEXT:    movntil %edi, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    movntiq %rdx, (%rcx) # sched: [5:1.00]
 ; SANDY-NEXT:    #NO_APP
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
@@ -13540,22 +13540,22 @@ define void @test_setcc(i8 %a0, i8 *%a1)
 ; GENERIC-NEXT:    setge %dil # sched: [1:0.50]
 ; GENERIC-NEXT:    setle %dil # sched: [1:0.50]
 ; GENERIC-NEXT:    setg %dil # sched: [1:0.50]
-; GENERIC-NEXT:    seto (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setno (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setb (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setae (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    sete (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setne (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setbe (%rsi) # sched: [3:1.00]
-; GENERIC-NEXT:    seta (%rsi) # sched: [3:1.00]
-; GENERIC-NEXT:    sets (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setns (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setp (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setnp (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setl (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setge (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setle (%rsi) # sched: [2:1.00]
-; GENERIC-NEXT:    setg (%rsi) # sched: [2:1.00]
+; GENERIC-NEXT:    seto (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setno (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setb (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setae (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    sete (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setne (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setbe (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    seta (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    sets (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setns (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setp (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setnp (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setl (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setge (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setle (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    setg (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -13654,22 +13654,22 @@ define void @test_setcc(i8 %a0, i8 *%a1)
 ; SANDY-NEXT:    setge %dil # sched: [1:0.50]
 ; SANDY-NEXT:    setle %dil # sched: [1:0.50]
 ; SANDY-NEXT:    setg %dil # sched: [1:0.50]
-; SANDY-NEXT:    seto (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setno (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setb (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setae (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    sete (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setne (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setbe (%rsi) # sched: [3:1.00]
-; SANDY-NEXT:    seta (%rsi) # sched: [3:1.00]
-; SANDY-NEXT:    sets (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setns (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setp (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setnp (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setl (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setge (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setle (%rsi) # sched: [2:1.00]
-; SANDY-NEXT:    setg (%rsi) # sched: [2:1.00]
+; SANDY-NEXT:    seto (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setno (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setb (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setae (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    sete (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setne (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setbe (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    seta (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    sets (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setns (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setp (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setnp (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setl (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setge (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setle (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    setg (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    #NO_APP
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;

Modified: llvm/trunk/test/CodeGen/X86/sse-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-schedule.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse-schedule.ll Wed Apr  4 22:19:36 2018
@@ -1939,7 +1939,7 @@ define float @test_divss(float %a0, floa
 define void @test_ldmxcsr(i32 %a0) {
 ; GENERIC-LABEL: test_ldmxcsr:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; GENERIC-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
 ; GENERIC-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -1957,13 +1957,13 @@ define void @test_ldmxcsr(i32 %a0) {
 ;
 ; SANDY-SSE-LABEL: test_ldmxcsr:
 ; SANDY-SSE:       # %bb.0:
-; SANDY-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_ldmxcsr:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; SANDY-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
 ; SANDY-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2487,7 +2487,7 @@ define void @test_movaps(<4 x float> *%a
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movaps (%rdi), %xmm0 # sched: [6:0.50]
 ; GENERIC-NEXT:    addps %xmm0, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    movaps %xmm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movaps %xmm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movaps:
@@ -2508,14 +2508,14 @@ define void @test_movaps(<4 x float> *%a
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movaps (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-SSE-NEXT:    addps %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movaps %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movaps %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movaps:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovaps (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovaps %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovaps %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movaps:
@@ -2712,7 +2712,7 @@ define void @test_movhps(<4 x float> %a0
 ; GENERIC-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    addps %xmm0, %xmm1 # sched: [3:1.00]
 ; GENERIC-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1] sched: [1:1.00]
-; GENERIC-NEXT:    movlps %xmm1, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    movlps %xmm1, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movhps:
@@ -2736,7 +2736,7 @@ define void @test_movhps(<4 x float> %a0
 ; SANDY-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; SANDY-SSE-NEXT:    addps %xmm0, %xmm1 # sched: [3:1.00]
 ; SANDY-SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1] sched: [1:1.00]
-; SANDY-SSE-NEXT:    movlps %xmm1, (%rdi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movlps %xmm1, (%rdi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movhps:
@@ -2959,7 +2959,7 @@ define void @test_movlps(<4 x float> %a0
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
 ; GENERIC-NEXT:    addps %xmm0, %xmm1 # sched: [3:1.00]
-; GENERIC-NEXT:    movlps %xmm1, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    movlps %xmm1, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movlps:
@@ -2980,14 +2980,14 @@ define void @test_movlps(<4 x float> %a0
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
 ; SANDY-SSE-NEXT:    addps %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movlps %xmm1, (%rdi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movlps %xmm1, (%rdi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movlps:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovlps %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovlps %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movlps:
@@ -3178,7 +3178,7 @@ declare i32 @llvm.x86.sse.movmsk.ps(<4 x
 define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) {
 ; GENERIC-LABEL: test_movntps:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movntps %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    movntps %xmm0, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movntps:
@@ -3199,12 +3199,12 @@ define void @test_movntps(<4 x float> %a
 ;
 ; SANDY-SSE-LABEL: test_movntps:
 ; SANDY-SSE:       # %bb.0:
-; SANDY-SSE-NEXT:    movntps %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movntps %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movntps:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vmovntps %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovntps %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movntps:
@@ -3275,7 +3275,7 @@ define void @test_movss_mem(float* %a0,
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
 ; GENERIC-NEXT:    addss %xmm0, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    movss %xmm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movss %xmm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movss_mem:
@@ -3296,14 +3296,14 @@ define void @test_movss_mem(float* %a0,
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
 ; SANDY-SSE-NEXT:    addss %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movss %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movss %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movss_mem:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
 ; SANDY-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovss %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovss %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movss_mem:
@@ -3495,7 +3495,7 @@ define void @test_movups(<4 x float> *%a
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movups (%rdi), %xmm0 # sched: [6:0.50]
 ; GENERIC-NEXT:    addps %xmm0, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    movups %xmm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movups %xmm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movups:
@@ -3516,14 +3516,14 @@ define void @test_movups(<4 x float> *%a
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movups (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-SSE-NEXT:    addps %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movups %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movups %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movups:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovups %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movups:

Modified: llvm/trunk/test/CodeGen/X86/sse2-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2-schedule.ll?rev=329256&r1=329255&r2=329256&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse2-schedule.ll Wed Apr  4 22:19:36 2018
@@ -4104,7 +4104,7 @@ define void @test_movapd(<2 x double> *%
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movapd (%rdi), %xmm0 # sched: [6:0.50]
 ; GENERIC-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    movapd %xmm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movapd %xmm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movapd:
@@ -4125,14 +4125,14 @@ define void @test_movapd(<2 x double> *%
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movapd (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movapd %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movapd %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movapd:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovapd (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovapd %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovapd %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movapd:
@@ -4229,7 +4229,7 @@ define void @test_movdqa(<2 x i64> *%a0,
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movdqa (%rdi), %xmm0 # sched: [6:0.50]
 ; GENERIC-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    movdqa %xmm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movdqa %xmm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movdqa:
@@ -4250,14 +4250,14 @@ define void @test_movdqa(<2 x i64> *%a0,
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movdqa (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-SSE-NEXT:    movdqa %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movdqa %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movdqa:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movdqa:
@@ -4354,7 +4354,7 @@ define void @test_movdqu(<2 x i64> *%a0,
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movdqu (%rdi), %xmm0 # sched: [6:0.50]
 ; GENERIC-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    movdqu %xmm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movdqu %xmm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movdqu:
@@ -4375,14 +4375,14 @@ define void @test_movdqu(<2 x i64> *%a0,
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movdqu (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-SSE-NEXT:    movdqu %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movdqu %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movdqu:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movdqu:
@@ -4482,7 +4482,7 @@ define i32 @test_movd(<4 x i32> %a0, i32
 ; GENERIC-NEXT:    paddd %xmm0, %xmm1 # sched: [1:0.50]
 ; GENERIC-NEXT:    paddd %xmm0, %xmm2 # sched: [1:0.50]
 ; GENERIC-NEXT:    movd %xmm2, %eax # sched: [2:1.00]
-; GENERIC-NEXT:    movd %xmm1, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movd %xmm1, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movd:
@@ -4512,7 +4512,7 @@ define i32 @test_movd(<4 x i32> %a0, i32
 ; SANDY-SSE-NEXT:    paddd %xmm0, %xmm1 # sched: [1:0.50]
 ; SANDY-SSE-NEXT:    paddd %xmm0, %xmm2 # sched: [1:0.50]
 ; SANDY-SSE-NEXT:    movd %xmm2, %eax # sched: [2:1.00]
-; SANDY-SSE-NEXT:    movd %xmm1, (%rsi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movd %xmm1, (%rsi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movd:
@@ -4522,7 +4522,7 @@ define i32 @test_movd(<4 x i32> %a0, i32
 ; SANDY-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
 ; SANDY-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SANDY-NEXT:    vmovd %xmm0, %eax # sched: [2:1.00]
-; SANDY-NEXT:    vmovd %xmm1, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovd %xmm1, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movd:
@@ -4663,7 +4663,7 @@ define i64 @test_movd_64(<2 x i64> %a0,
 ; GENERIC-NEXT:    paddq %xmm0, %xmm1 # sched: [1:0.50]
 ; GENERIC-NEXT:    paddq %xmm0, %xmm2 # sched: [1:0.50]
 ; GENERIC-NEXT:    movq %xmm2, %rax # sched: [2:1.00]
-; GENERIC-NEXT:    movq %xmm1, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movq %xmm1, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movd_64:
@@ -4693,7 +4693,7 @@ define i64 @test_movd_64(<2 x i64> %a0,
 ; SANDY-SSE-NEXT:    paddq %xmm0, %xmm1 # sched: [1:0.50]
 ; SANDY-SSE-NEXT:    paddq %xmm0, %xmm2 # sched: [1:0.50]
 ; SANDY-SSE-NEXT:    movq %xmm2, %rax # sched: [2:1.00]
-; SANDY-SSE-NEXT:    movq %xmm1, (%rsi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movq %xmm1, (%rsi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movd_64:
@@ -4703,7 +4703,7 @@ define i64 @test_movd_64(<2 x i64> %a0,
 ; SANDY-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
 ; SANDY-NEXT:    vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SANDY-NEXT:    vmovq %xmm0, %rax # sched: [2:1.00]
-; SANDY-NEXT:    vmovq %xmm1, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovq %xmm1, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movd_64:
@@ -4841,7 +4841,7 @@ define void @test_movhpd(<2 x double> %a
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    addpd %xmm0, %xmm1 # sched: [3:1.00]
-; GENERIC-NEXT:    movhpd %xmm1, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    movhpd %xmm1, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movhpd:
@@ -4862,14 +4862,14 @@ define void @test_movhpd(<2 x double> %a
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; SANDY-SSE-NEXT:    addpd %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movhpd %xmm1, (%rdi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movhpd %xmm1, (%rdi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movhpd:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movhpd:
@@ -4969,7 +4969,7 @@ define void @test_movlpd(<2 x double> %a
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
 ; GENERIC-NEXT:    addpd %xmm0, %xmm1 # sched: [3:1.00]
-; GENERIC-NEXT:    movlpd %xmm1, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    movlpd %xmm1, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movlpd:
@@ -4990,14 +4990,14 @@ define void @test_movlpd(<2 x double> %a
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
 ; SANDY-SSE-NEXT:    addpd %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movlpd %xmm1, (%rdi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movlpd %xmm1, (%rdi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movlpd:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movlpd:
@@ -5188,7 +5188,7 @@ define void @test_movntdqa(<2 x i64> %a0
 ; GENERIC-LABEL: test_movntdqa:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    movntdq %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    movntdq %xmm0, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movntdqa:
@@ -5208,13 +5208,13 @@ define void @test_movntdqa(<2 x i64> %a0
 ; SANDY-SSE-LABEL: test_movntdqa:
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-SSE-NEXT:    movntdq %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movntdq %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movntdqa:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movntdqa:
@@ -5297,7 +5297,7 @@ define void @test_movntpd(<2 x double> %
 ; GENERIC-LABEL: test_movntpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    movntpd %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    movntpd %xmm0, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movntpd:
@@ -5315,13 +5315,13 @@ define void @test_movntpd(<2 x double> %
 ; SANDY-SSE-LABEL: test_movntpd:
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movntpd %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movntpd %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movntpd:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movntpd:
@@ -5405,7 +5405,7 @@ define <2 x i64> @test_movq_mem(<2 x i64
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
 ; GENERIC-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    movq %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    movq %xmm0, (%rdi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movq_mem:
@@ -5426,14 +5426,14 @@ define <2 x i64> @test_movq_mem(<2 x i64
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
 ; SANDY-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; SANDY-SSE-NEXT:    movq %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movq %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movq_mem:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
 ; SANDY-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vmovq %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovq %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movq_mem:
@@ -5641,7 +5641,7 @@ define void @test_movsd_mem(double* %a0,
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
 ; GENERIC-NEXT:    addsd %xmm0, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    movsd %xmm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movsd %xmm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movsd_mem:
@@ -5662,14 +5662,14 @@ define void @test_movsd_mem(double* %a0,
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
 ; SANDY-SSE-NEXT:    addsd %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movsd %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movsd %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movsd_mem:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
 ; SANDY-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovsd %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovsd %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movsd_mem:
@@ -5869,7 +5869,7 @@ define void @test_movupd(<2 x double> *%
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movupd (%rdi), %xmm0 # sched: [6:0.50]
 ; GENERIC-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    movupd %xmm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT:    movupd %xmm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movupd:
@@ -5890,14 +5890,14 @@ define void @test_movupd(<2 x double> *%
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movupd (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movupd %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movupd %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movupd:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovupd (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
+; SANDY-NEXT:    vmovupd %xmm0, (%rsi) # sched: [5:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_movupd:




More information about the llvm-commits mailing list