[llvm] r331380 - [X86] Cleanup WriteFShuffle/WriteFVarShuffle (+256 variants) scheduler classes with more common default values

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed May 2 10:58:50 PDT 2018


Author: rksimon
Date: Wed May  2 10:58:50 2018
New Revision: 331380

URL: http://llvm.org/viewvc/llvm-project?rev=331380&view=rev
Log:
[X86] Cleanup WriteFShuffle/WriteFVarShuffle (+256 variants) scheduler classes with more common default values

Modified:
    llvm/trunk/lib/Target/X86/X86SchedBroadwell.td
    llvm/trunk/lib/Target/X86/X86SchedHaswell.td
    llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td
    llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td
    llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td
    llvm/trunk/test/CodeGen/X86/avx2-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
    llvm/trunk/test/CodeGen/X86/xop-schedule.ll

Modified: llvm/trunk/lib/Target/X86/X86SchedBroadwell.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedBroadwell.td?rev=331380&r1=331379&r2=331380&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedBroadwell.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedBroadwell.td Wed May  2 10:58:50 2018
@@ -205,8 +205,8 @@ defm : BWWriteResPair<WriteVecLogicY,[BW
 defm : BWWriteResPair<WriteVecShift, [BWPort0],  1>; // Vector integer shifts.
 defm : BWWriteResPair<WriteVecIMul,  [BWPort0],   5>; // Vector integer multiply.
 defm : BWWriteResPair<WritePMULLD,   [BWPort0], 10, [2], 2, 5>; // PMULLD
-defm : BWWriteResPair<WriteShuffle,  [BWPort5],  1>; // Vector shuffles.
-defm : BWWriteResPair<WriteVarShuffle, [BWPort5],  1>; // Vector variable shuffles.
+defm : BWWriteResPair<WriteShuffle,  [BWPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : BWWriteResPair<WriteVarShuffle, [BWPort5], 1, [1], 1, 5>; // Vector variable shuffles.
 defm : BWWriteResPair<WriteBlend,  [BWPort5],  1>; // Vector blends.
 defm : BWWriteResPair<WriteVarBlend,  [BWPort5], 2, [2], 2, 5>; // Vector variable blends.
 defm : BWWriteResPair<WriteMPSAD,  [BWPort0, BWPort5], 7, [1, 2], 3, 5>; // Vector MPSAD.
@@ -334,11 +334,11 @@ defm : BWWriteResPair<WriteCLMul,  [BWPo
 def : WriteRes<WriteSystem,     [BWPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
 
 // AVX2.
-defm : BWWriteResPair<WriteFShuffle256,  [BWPort5],  3>; // Fp 256-bit width vector shuffles.
-defm : BWWriteResPair<WriteFVarShuffle256,  [BWPort5],  3>; // Fp 256-bit width vector variable shuffles.
-defm : BWWriteResPair<WriteShuffle256,  [BWPort5],  3>;  // 256-bit width vector shuffles.
-defm : BWWriteResPair<WriteVarShuffle256,  [BWPort5],  3>;  // 256-bit width vector variable shuffles.
-defm : BWWriteResPair<WriteVarVecShift,  [BWPort0, BWPort5], 2, [2, 1]>; // Variable vector shifts.
+defm : BWWriteResPair<WriteFShuffle256, [BWPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteFVarShuffle256, [BWPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
+defm : BWWriteResPair<WriteShuffle256, [BWPort5], 3, [1], 1, 7>;  // 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteVarShuffle256, [BWPort5], 3, [1], 1, 7>;  // 256-bit width vector variable shuffles.
+defm : BWWriteResPair<WriteVarVecShift, [BWPort0, BWPort5], 2, [2, 1]>; // Variable vector shifts.
 
 // Old microcoded instructions that nobody use.
 def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;

Modified: llvm/trunk/lib/Target/X86/X86SchedHaswell.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedHaswell.td?rev=331380&r1=331379&r2=331380&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedHaswell.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedHaswell.td Wed May  2 10:58:50 2018
@@ -172,14 +172,14 @@ defm : HWWriteResPair<WriteFMAY,  [HWPor
 defm : HWWriteResPair<WriteFSign,  [HWPort0], 1>;
 defm : HWWriteResPair<WriteFLogic,  [HWPort5], 1, [1], 1, 6>;
 defm : HWWriteResPair<WriteFLogicY, [HWPort5], 1, [1], 1, 7>;
-defm : HWWriteResPair<WriteFShuffle,  [HWPort5], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteFShuffle,  [HWPort5], 1, [1], 1, 6>;
 defm : HWWriteResPair<WriteFShuffleY, [HWPort5], 1, [1], 1, 7>;
 defm : HWWriteResPair<WriteFVarShuffle,  [HWPort5], 1, [1], 1, 6>;
 defm : HWWriteResPair<WriteFVarShuffleY, [HWPort5], 1, [1], 1, 7>;
 defm : HWWriteResPair<WriteFBlend,  [HWPort015], 1, [1], 1, 6>;
 defm : HWWriteResPair<WriteFBlendY, [HWPort015], 1, [1], 1, 7>;
-defm : HWWriteResPair<WriteFShuffle256,  [HWPort5],  3>;
-defm : HWWriteResPair<WriteFVarShuffle256,  [HWPort5],  3>;
+defm : HWWriteResPair<WriteFShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFVarShuffle256, [HWPort5], 3, [1], 1, 7>;
 defm : HWWriteResPair<WriteFVarBlend,  [HWPort5], 2, [2], 2, 6>;
 defm : HWWriteResPair<WriteFVarBlendY, [HWPort5], 2, [2], 2, 7>;
 
@@ -200,11 +200,11 @@ defm : HWWriteResPair<WriteVecLogicY,[HW
 defm : HWWriteResPair<WriteVecALU,   [HWPort15],  1>;
 defm : HWWriteResPair<WriteVecIMul,  [HWPort0],   5>;
 defm : HWWriteResPair<WritePMULLD,   [HWPort0], 10, [2], 2, 6>;
-defm : HWWriteResPair<WriteShuffle,  [HWPort5],  1>;
-defm : HWWriteResPair<WriteVarShuffle,  [HWPort5],  1>;
+defm : HWWriteResPair<WriteShuffle,  [HWPort5],  1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVarShuffle,[HWPort5], 1, [1], 1, 5>;
 defm : HWWriteResPair<WriteBlend,  [HWPort5], 1, [1], 1, 6>;
-defm : HWWriteResPair<WriteShuffle256,  [HWPort5],  3>;
-defm : HWWriteResPair<WriteVarShuffle256,  [HWPort5],  3>;
+defm : HWWriteResPair<WriteShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteVarShuffle256, [HWPort5], 3, [1], 1, 7>;
 defm : HWWriteResPair<WriteVarBlend,  [HWPort5], 2, [2], 2, 6>;
 defm : HWWriteResPair<WriteVarVecShift,  [HWPort0, HWPort5], 2, [2, 1]>;
 defm : HWWriteResPair<WriteMPSAD,  [HWPort0, HWPort5], 7, [1, 2], 3, 6>;
@@ -221,6 +221,7 @@ def : WriteRes<WriteVecInsertLd, [HWPort
   let Latency = 6;
   let NumMicroOps = 2;
 }
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
 
 def : WriteRes<WriteVecExtract, [HWPort0,HWPort5]> {
   let Latency = 2;
@@ -874,14 +875,11 @@ def HWWriteResGroup13 : SchedWriteRes<[H
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[HWWriteResGroup13], (instregex "(V?)INSERTPSrm",
-                                            "(V?)PACKSSDWrm",
+def: InstRW<[HWWriteResGroup13], (instregex "(V?)PACKSSDWrm",
                                             "(V?)PACKSSWBrm",
                                             "(V?)PACKUSDWrm",
                                             "(V?)PACKUSWBrm",
                                             "(V?)PALIGNRrmi",
-                                            "VPERMILPDmi",
-                                            "VPERMILPSmi",
                                             "(V?)PSHUFBrm",
                                             "(V?)PSHUFDmi",
                                             "(V?)PSHUFHWmi",
@@ -893,13 +891,7 @@ def: InstRW<[HWWriteResGroup13], (instre
                                             "(V?)PUNPCKLBWrm",
                                             "(V?)PUNPCKLDQrm",
                                             "(V?)PUNPCKLQDQrm",
-                                            "(V?)PUNPCKLWDrm",
-                                            "(V?)SHUFPDrmi",
-                                            "(V?)SHUFPSrmi",
-                                            "(V?)UNPCKHPDrm",
-                                            "(V?)UNPCKHPSrm",
-                                            "(V?)UNPCKLPDrm",
-                                            "(V?)UNPCKLPSrm")>;
+                                            "(V?)PUNPCKLWDrm")>;
 
 def HWWriteResGroup13_1 : SchedWriteRes<[HWPort5,HWPort23]> {
   let Latency = 8;
@@ -1415,13 +1407,7 @@ def HWWriteResGroup53 : SchedWriteRes<[H
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[HWWriteResGroup53], (instregex "VPERM2F128rm",
-                                            "VPERM2I128rm",
-                                            "VPERMDYrm",
-                                            "VPERMPDYmi",
-                                            "VPERMPSYrm",
-                                            "VPERMQYmi",
-                                            "VPMOVZXBDYrm",
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBDYrm",
                                             "VPMOVZXBQYrm",
                                             "VPMOVZXBWYrm",
                                             "VPMOVZXDQYrm",
@@ -1798,8 +1784,8 @@ def HWWriteResGroup89 : SchedWriteRes<[H
   let ResourceCycles = [1];
 }
 def: InstRW<[HWWriteResGroup89], (instregex "(V?)PCMPGTQ(Y?)rr",
-                                            "MUL_FPrST0",
-                                            "MUL_FST0r",
+                                            "MUL_FPrST0",
+                                            "MUL_FST0r",
                                             "MUL_FrST0")>;
 
 def HWWriteResGroup91_1 : SchedWriteRes<[HWPort0,HWPort23,HWFPDivider]> {

Modified: llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td?rev=331380&r1=331379&r2=331380&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td Wed May  2 10:58:50 2018
@@ -159,10 +159,10 @@ defm : SBWriteResPair<WriteCvtF2F, [SBPo
 defm : SBWriteResPair<WriteFSign,    [SBPort5], 1>;
 defm : SBWriteResPair<WriteFLogic,   [SBPort5], 1, [1], 1, 6>;
 defm : SBWriteResPair<WriteFLogicY,  [SBPort5], 1, [1], 1, 7>;
-defm : SBWriteResPair<WriteFShuffle, [SBPort5], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteFShuffle, [SBPort5], 1, [1], 1, 6>;
 defm : SBWriteResPair<WriteFShuffleY,[SBPort5], 1, [1], 1, 7>;
-defm : SBWriteResPair<WriteFVarShuffle, [SBPort5],  1>;
-defm : SBWriteResPair<WriteFVarShuffleY,[SBPort5],  1>;
+defm : SBWriteResPair<WriteFVarShuffle, [SBPort5], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFVarShuffleY,[SBPort5], 1, [1], 1, 7>;
 defm : SBWriteResPair<WriteFBlend,    [SBPort05], 1, [1], 1, 6>;
 defm : SBWriteResPair<WriteFBlendY,   [SBPort05], 1, [1], 1, 7>;
 defm : SBWriteResPair<WriteFVarBlend, [SBPort05], 2, [2], 2, 6>;
@@ -180,8 +180,8 @@ defm : SBWriteResPair<WriteVecLogicY,[SB
 defm : SBWriteResPair<WriteVecALU,   [SBPort1], 3>;
 defm : SBWriteResPair<WriteVecIMul,  [SBPort0], 5>;
 defm : SBWriteResPair<WritePMULLD,   [SBPort0], 5, [1], 1, 6>; // TODO this is probably wrong for 256/512-bit for the "generic" model
-defm : SBWriteResPair<WriteShuffle,  [SBPort5], 1>;
-defm : SBWriteResPair<WriteVarShuffle,  [SBPort15], 1>;
+defm : SBWriteResPair<WriteShuffle,  [SBPort5], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVarShuffle,  [SBPort15], 1, [1], 1, 5>;
 defm : SBWriteResPair<WriteBlend,   [SBPort15], 1, [1], 1, 6>;
 defm : SBWriteResPair<WriteVarBlend, [SBPort15], 2, [2], 2, 6>;
 defm : SBWriteResPair<WriteMPSAD, [SBPort0, SBPort15], 7, [1,2], 3, 6>;
@@ -322,10 +322,10 @@ def : WriteRes<WriteNop, []>;
 
 // AVX2/FMA is not supported on that architecture, but we should define the basic
 // scheduling resources anyway.
-defm : SBWriteResPair<WriteFShuffle256, [SBPort0],  1>;
-defm : SBWriteResPair<WriteFVarShuffle256, [SBPort0],  1>;
-defm : SBWriteResPair<WriteShuffle256, [SBPort0],  1>;
-defm : SBWriteResPair<WriteVarShuffle256, [SBPort0],  1>;
+defm : SBWriteResPair<WriteFShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFVarShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVarShuffle256, [SBPort5], 1, [1], 1, 7>;
 defm : SBWriteResPair<WriteVarVecShift, [SBPort0],  1>;
 defm : SBWriteResPair<WriteFMA, [SBPort01],  5>;
 defm : SBWriteResPair<WriteFMAS, [SBPort01],  5>;
@@ -372,9 +372,6 @@ def: InstRW<[SBWriteResGroup2], (instreg
                                            "RETQ",
                                            "ST_FPrr",
                                            "ST_Frr",
-                                           "VEXTRACTF128rr",
-                                           "VINSERTF128rr",
-                                           "VPERM2F128rr",
                                            "(V?)MOV64toPQIrr",
                                            "(V?)MOVDI2PDIrr")>;
 
@@ -936,28 +933,6 @@ def: InstRW<[SBWriteResGroup55], (instre
                                             "VTESTPDrm",
                                             "VTESTPSrm")>;
 
-def SBWriteResGroup56 : SchedWriteRes<[SBPort5,SBPort23]> {
-  let Latency = 7;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup56], (instregex "VBROADCASTF128",
-                                            "(V?)INSERTPSrm",
-                                            "(V?)MOVHPDrm",
-                                            "(V?)MOVHPSrm",
-                                            "(V?)MOVLPDrm",
-                                            "(V?)MOVLPSrm",
-                                            "VPERMILPDmi",
-                                            "VPERMILPDrm",
-                                            "VPERMILPSmi",
-                                            "VPERMILPSrm",
-                                            "(V?)SHUFPDrmi",
-                                            "(V?)SHUFPSrmi",
-                                            "(V?)UNPCKHPDrm",
-                                            "(V?)UNPCKHPSrm",
-                                            "(V?)UNPCKLPDrm",
-                                            "(V?)UNPCKLPSrm")>;
-
 def SBWriteResGroup58 : SchedWriteRes<[SBPort23,SBPort05]> {
   let Latency = 7;
   let NumMicroOps = 2;
@@ -1135,15 +1110,6 @@ def SBWriteResGroup72 : SchedWriteRes<[S
 }
 def: InstRW<[SBWriteResGroup72], (instrs MUL8m)>;
 
-def SBWriteResGroup73 : SchedWriteRes<[SBPort5,SBPort23]> {
-  let Latency = 8;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1,1];
-}
-def: InstRW<[SBWriteResGroup73], (instregex "VPERM2F128rm",
-                                            "VPERMILPDYrm",
-                                            "VPERMILPSYrm")>;
-
 def SBWriteResGroup75 : SchedWriteRes<[SBPort23,SBPort05]> {
   let Latency = 8;
   let NumMicroOps = 3;

Modified: llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td?rev=331380&r1=331379&r2=331380&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td Wed May  2 10:58:50 2018
@@ -172,10 +172,10 @@ defm : SKLWriteResPair<WriteFMAY,   [SKL
 defm : SKLWriteResPair<WriteFSign,   [SKLPort0], 1>; // Floating point fabs/fchs.
 defm : SKLWriteResPair<WriteFLogic,  [SKLPort015], 1, [1], 1, 6>; // Floating point and/or/xor logicals.
 defm : SKLWriteResPair<WriteFLogicY, [SKLPort015], 1, [1], 1, 7>; // Floating point and/or/xor logicals (YMM/ZMM).
-defm : SKLWriteResPair<WriteFShuffle,  [SKLPort5], 1, [1], 1, 5>; // Floating point vector shuffles.
+defm : SKLWriteResPair<WriteFShuffle,  [SKLPort5], 1, [1], 1, 6>; // Floating point vector shuffles.
 defm : SKLWriteResPair<WriteFShuffleY, [SKLPort5], 1, [1], 1, 7>; // Floating point vector shuffles (YMM/ZMM).
-defm : SKLWriteResPair<WriteFVarShuffle,  [SKLPort5],  1>; // Floating point vector shuffles.
-defm : SKLWriteResPair<WriteFVarShuffleY, [SKLPort5],  1>; // Floating point vector shuffles.
+defm : SKLWriteResPair<WriteFVarShuffle,  [SKLPort5], 1, [1], 1, 6>; // Floating point vector shuffles.
+defm : SKLWriteResPair<WriteFVarShuffleY, [SKLPort5], 1, [1], 1, 7>; // Floating point vector shuffles.
 defm : SKLWriteResPair<WriteFBlend,  [SKLPort015], 1, [1], 1, 6>; // Floating point vector blends.
 defm : SKLWriteResPair<WriteFBlendY, [SKLPort015], 1, [1], 1, 7>; // Floating point vector blends.
 defm : SKLWriteResPair<WriteFVarBlend, [SKLPort015], 2, [2], 2, 6>; // Fp vector variable blends.
@@ -201,8 +201,8 @@ defm : SKLWriteResPair<WriteVecLogicY,[S
 defm : SKLWriteResPair<WriteVecShift, [SKLPort0],  1>; // Vector integer shifts.
 defm : SKLWriteResPair<WriteVecIMul,  [SKLPort0],   5>; // Vector integer multiply.
 defm : SKLWriteResPair<WritePMULLD,   [SKLPort01], 10, [2], 2, 6>;
-defm : SKLWriteResPair<WriteShuffle,  [SKLPort5],  1>; // Vector shuffles.
-defm : SKLWriteResPair<WriteVarShuffle,  [SKLPort5],  1>; // Vector shuffles.
+defm : SKLWriteResPair<WriteShuffle,  [SKLPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : SKLWriteResPair<WriteVarShuffle,  [SKLPort5], 1, [1], 1, 5>; // Vector shuffles.
 defm : SKLWriteResPair<WriteBlend,  [SKLPort5], 1, [1], 1, 6>; // Vector blends.
 defm : SKLWriteResPair<WriteVarBlend,  [SKLPort015], 2, [2], 2, 6>; // Vector variable blends.
 defm : SKLWriteResPair<WriteMPSAD,  [SKLPort5], 4, [2], 2, 6>; // Vector MPSAD.
@@ -219,6 +219,7 @@ def : WriteRes<WriteVecInsertLd, [SKLPor
   let Latency = 6;
   let NumMicroOps = 2;
 }
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
 
 def : WriteRes<WriteVecExtract, [SKLPort0,SKLPort5]> {
   let Latency = 3;
@@ -339,10 +340,10 @@ def : WriteRes<WriteCLMulLd, [SKLPort5,
 def : WriteRes<WriteSystem,     [SKLPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
 
 // AVX2.
-defm : SKLWriteResPair<WriteFShuffle256,  [SKLPort5],  3>; // Fp 256-bit width vector shuffles.
-defm : SKLWriteResPair<WriteFVarShuffle256,  [SKLPort5],  3>; // Fp 256-bit width vector variable shuffles.
-defm : SKLWriteResPair<WriteShuffle256,  [SKLPort5],  3>;  // 256-bit width vector shuffles.
-defm : SKLWriteResPair<WriteVarShuffle256,  [SKLPort5],  3>;  // 256-bit width vector variable shuffles.
+defm : SKLWriteResPair<WriteFShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteFVarShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
+defm : SKLWriteResPair<WriteShuffle256, [SKLPort5], 3, [1], 1, 7>;  // 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteVarShuffle256, [SKLPort5], 3, [1], 1, 7>;  // 256-bit width vector variable shuffles.
 defm : SKLWriteResPair<WriteVarVecShift,  [SKLPort0, SKLPort5], 2, [2, 1]>;  // Variable vector shifts.
 
 // Old microcoded instructions that nobody use.
@@ -1260,18 +1261,13 @@ def SKLWriteResGroup88 : SchedWriteRes<[
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[SKLWriteResGroup88], (instregex "(V?)INSERTPSrm",
-                                             "(V?)PACKSSDWrm",
+def: InstRW<[SKLWriteResGroup88], (instregex "(V?)PACKSSDWrm",
                                              "(V?)PACKSSWBrm",
                                              "(V?)PACKUSDWrm",
                                              "(V?)PACKUSWBrm",
                                              "(V?)PALIGNRrmi",
                                              "VPBROADCASTBrm",
                                              "VPBROADCASTWrm",
-                                             "VPERMILPDmi",
-                                             "VPERMILPDrm",
-                                             "VPERMILPSmi",
-                                             "VPERMILPSrm",
                                              "(V?)PSHUFBrm",
                                              "(V?)PSHUFDmi",
                                              "(V?)PSHUFHWmi",
@@ -1283,13 +1279,7 @@ def: InstRW<[SKLWriteResGroup88], (instr
                                              "(V?)PUNPCKLBWrm",
                                              "(V?)PUNPCKLDQrm",
                                              "(V?)PUNPCKLQDQrm",
-                                             "(V?)PUNPCKLWDrm",
-                                             "(V?)SHUFPDrmi",
-                                             "(V?)SHUFPSrmi",
-                                             "(V?)UNPCKHPDrm",
-                                             "(V?)UNPCKHPSrm",
-                                             "(V?)UNPCKLPDrm",
-                                             "(V?)UNPCKLPSrm")>;
+                                             "(V?)PUNPCKLWDrm")>;
 
 def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort01]> {
   let Latency = 7;
@@ -1514,8 +1504,6 @@ def: InstRW<[SKLWriteResGroup108], (inst
                                               "VPBLENDWYrmi",
                                               "VPBROADCASTBYrm",
                                               "VPBROADCASTWYrm",
-                                              "VPERMILPDYrm",
-                                              "VPERMILPSYrm",
                                               "VPMOVSXBDYrm",
                                               "VPMOVSXBQYrm",
                                               "VPMOVSXWQYrm",
@@ -1791,12 +1779,6 @@ def SKLWriteResGroup133 : SchedWriteRes<
 def: InstRW<[SKLWriteResGroup133], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
                                               "ILD_F(16|32|64)m",
                                               "VPCMPGTQYrm",
-                                              "VPERM2F128rm",
-                                              "VPERM2I128rm",
-                                              "VPERMDYrm",
-                                              "VPERMPDYmi",
-                                              "VPERMPSYrm",
-                                              "VPERMQYmi",
                                               "VPMOVZXBDYrm",
                                               "VPMOVZXBQYrm",
                                               "VPMOVZXBWYrm",

Modified: llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td?rev=331380&r1=331379&r2=331380&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td Wed May  2 10:58:50 2018
@@ -172,10 +172,10 @@ defm : SKXWriteResPair<WriteFMAY, [SKXPo
 defm : SKXWriteResPair<WriteFSign,  [SKXPort0],  1>; // Floating point fabs/fchs.
 defm : SKXWriteResPair<WriteFLogic, [SKXPort015], 1, [1], 1, 6>; // Floating point and/or/xor logicals.
 defm : SKXWriteResPair<WriteFLogicY, [SKXPort015], 1, [1], 1, 7>; // Floating point and/or/xor logicals (YMM/ZMM).
-defm : SKXWriteResPair<WriteFShuffle,  [SKXPort5], 1, [1], 1, 5>; // Floating point vector shuffles.
+defm : SKXWriteResPair<WriteFShuffle,  [SKXPort5], 1, [1], 1, 6>; // Floating point vector shuffles.
 defm : SKXWriteResPair<WriteFShuffleY, [SKXPort5], 1, [1], 1, 7>; // Floating point vector shuffles (YMM/ZMM).
-defm : SKXWriteResPair<WriteFVarShuffle,  [SKXPort5],  1>; // Floating point vector variable shuffles.
-defm : SKXWriteResPair<WriteFVarShuffleY, [SKXPort5],  1>; // Floating point vector variable shuffles.
+defm : SKXWriteResPair<WriteFVarShuffle,  [SKXPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles.
+defm : SKXWriteResPair<WriteFVarShuffleY, [SKXPort5], 1, [1], 1, 7>; // Floating point vector variable shuffles.
 defm : SKXWriteResPair<WriteFBlend, [SKXPort015], 1, [1], 1, 6>; // Floating point vector blends.
 defm : SKXWriteResPair<WriteFBlendY,[SKXPort015], 1, [1], 1, 7>; // Floating point vector blends.
 defm : SKXWriteResPair<WriteFVarBlend, [SKXPort015], 2, [2], 2, 6>; // Fp vector variable blends.
@@ -201,8 +201,8 @@ defm : SKXWriteResPair<WriteVecLogicY,[S
 defm : SKXWriteResPair<WriteVecShift, [SKXPort0],  1>; // Vector integer shifts.
 defm : SKXWriteResPair<WriteVecIMul,  [SKXPort0],   5>; // Vector integer multiply.
 defm : SKXWriteResPair<WritePMULLD,   [SKXPort015], 10, [2], 2, 6>; // Vector integer multiply.
-defm : SKXWriteResPair<WriteShuffle,  [SKXPort5],  1>; // Vector shuffles.
-defm : SKXWriteResPair<WriteVarShuffle,  [SKXPort5],  1>; // Vector variable shuffles.
+defm : SKXWriteResPair<WriteShuffle,  [SKXPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : SKXWriteResPair<WriteVarShuffle,  [SKXPort5], 1, [1], 1, 5>; // Vector variable shuffles.
 defm : SKXWriteResPair<WriteBlend, [SKXPort5], 1, [1], 1, 6>; // Vector blends.
 defm : SKXWriteResPair<WriteVarBlend, [SKXPort015], 2, [2], 2, 6>; // Vector variable blends.
 defm : SKXWriteResPair<WriteMPSAD,  [SKXPort5], 4, [2], 2, 6>; // Vector MPSAD.
@@ -219,6 +219,7 @@ def : WriteRes<WriteVecInsertLd, [SKXPor
   let Latency = 6;
   let NumMicroOps = 2;
 }
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
 
 def : WriteRes<WriteVecExtract, [SKXPort0,SKXPort5]> {
   let Latency = 3;
@@ -339,10 +340,10 @@ def : WriteRes<WriteCLMulLd, [SKXPort5,
 def : WriteRes<WriteSystem,     [SKXPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
 
 // AVX2.
-defm : SKXWriteResPair<WriteFShuffle256,  [SKXPort5],  3>; // Fp 256-bit width vector shuffles.
-defm : SKXWriteResPair<WriteFVarShuffle256,  [SKXPort5],  3>; // Fp 256-bit width vector variable shuffles.
-defm : SKXWriteResPair<WriteShuffle256,  [SKXPort5],  3>;  // 256-bit width vector shuffles.
-defm : SKXWriteResPair<WriteVarShuffle256,  [SKXPort5],  3>;  // 256-bit width vector variable shuffles.
+defm : SKXWriteResPair<WriteFShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteFVarShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
+defm : SKXWriteResPair<WriteShuffle256, [SKXPort5], 3, [1], 1, 7>;  // 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteVarShuffle256, [SKXPort5], 3, [1], 1, 7>;  // 256-bit width vector variable shuffles.
 defm : SKXWriteResPair<WriteVarVecShift,  [SKXPort0, SKXPort5], 2, [2, 1]>; // Variable vector shifts.
 
 // Old microcoded instructions that nobody use.
@@ -1284,57 +1285,18 @@ def: InstRW<[SKXWriteResGroup32], (instr
                                              "VPMINUQZ128rr",
                                              "VPMINUQZ256rr",
                                              "VPMINUQZrr",
-                                             "VPMOVQDZ128rr",
-                                             "VPMOVQDZ256rr",
-                                             "VPMOVQDZrr",
                                              "VPMOVSXBDYrr",
-                                             "VPMOVSXBDZ128rr",
-                                             "VPMOVSXBDZ256rr",
-                                             "VPMOVSXBDZrr",
                                              "VPMOVSXBQYrr",
-                                             "VPMOVSXBQZ128rr",
-                                             "VPMOVSXBQZ256rr",
-                                             "VPMOVSXBQZrr",
                                              "VPMOVSXBWYrr",
-                                             "VPMOVSXBWZ128rr",
-                                             "VPMOVSXBWZ256rr",
-                                             "VPMOVSXBWZrr",
                                              "VPMOVSXDQYrr",
-                                             "VPMOVSXDQZ128rr",
-                                             "VPMOVSXDQZ256rr",
-                                             "VPMOVSXDQZrr",
                                              "VPMOVSXWDYrr",
-                                             "VPMOVSXWDZ128rr",
-                                             "VPMOVSXWDZ256rr",
-                                             "VPMOVSXWDZrr",
                                              "VPMOVSXWQYrr",
-                                             "VPMOVSXWQZ128rr",
-                                             "VPMOVSXWQZ256rr",
-                                             "VPMOVSXWQZrr",
                                              "VPMOVZXBDYrr",
-                                             "VPMOVZXBDZ128rr",
-                                             "VPMOVZXBDZ256rr",
-                                             "VPMOVZXBDZrr",
                                              "VPMOVZXBQYrr",
-                                             "VPMOVZXBQZ128rr",
-                                             "VPMOVZXBQZ256rr",
-                                             "VPMOVZXBQZrr",
                                              "VPMOVZXBWYrr",
-                                             "VPMOVZXBWZ128rr",
-                                             "VPMOVZXBWZ256rr",
-                                             "VPMOVZXBWZrr",
                                              "VPMOVZXDQYrr",
-                                             "VPMOVZXDQZ128rr",
-                                             "VPMOVZXDQZ256rr",
-                                             "VPMOVZXDQZrr",
                                              "VPMOVZXWDYrr",
-                                             "VPMOVZXWDZ128rr",
-                                             "VPMOVZXWDZ256rr",
-                                             "VPMOVZXWDZrr",
                                              "VPMOVZXWQYrr",
-                                             "VPMOVZXWQZ128rr",
-                                             "VPMOVZXWQZ256rr",
-                                             "VPMOVZXWQZrr",
                                              "VPSADBWZrr", // TODO: 512-bit ops require ports 0/1 to be joined.
                                              "VPTESTMBZ128rr",
                                              "VPTESTMBZ256rr",
@@ -2189,9 +2151,7 @@ def SKXWriteResGroup92 : SchedWriteRes<[
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[SKXWriteResGroup92], (instregex "VINSERTPSZrm(b?)",
-                                             "(V?)INSERTPSrm",
-                                             "VMOVSDZrm(b?)",
+def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSDZrm(b?)",
                                              "VMOVSSZrm(b?)",
                                              "VPACKSSDWZ128rm(b?)",
                                              "(V?)PACKSSDWrm",
@@ -2207,14 +2167,6 @@ def: InstRW<[SKXWriteResGroup92], (instr
                                              "VPBROADCASTBrm",
                                              "VPBROADCASTWZ128m(b?)",
                                              "VPBROADCASTWrm",
-                                             "VPERMILPDZ128m(b?)i",
-                                             "VPERMILPDZ128rm(b?)",
-                                             "VPERMILPDmi",
-                                             "VPERMILPDrm",
-                                             "VPERMILPSZ128m(b?)i",
-                                             "VPERMILPSZ128rm(b?)",
-                                             "VPERMILPSmi",
-                                             "VPERMILPSrm",
                                              "VPSHUFBZ128rm(b?)",
                                              "(V?)PSHUFBrm",
                                              "VPSHUFDZ128m(b?)i",
@@ -2240,19 +2192,7 @@ def: InstRW<[SKXWriteResGroup92], (instr
                                              "VPUNPCKLQDQZ128rm(b?)",
                                              "(V?)PUNPCKLQDQrm",
                                              "VPUNPCKLWDZ128rm(b?)",
-                                             "(V?)PUNPCKLWDrm",
-                                             "VSHUFPDZ128rm(b?)i",
-                                             "(V?)SHUFPDrmi",
-                                             "VSHUFPSZ128rm(b?)i",
-                                             "(V?)SHUFPSrmi",
-                                             "VUNPCKHPDZ128rm(b?)",
-                                             "(V?)UNPCKHPDrm",
-                                             "VUNPCKHPSZ128rm(b?)",
-                                             "(V?)UNPCKHPSrm",
-                                             "VUNPCKLPDZ128rm(b?)",
-                                             "(V?)UNPCKLPDrm",
-                                             "VUNPCKLPSZ128rm(b?)",
-                                             "(V?)UNPCKLPSrm")>;
+                                             "(V?)PUNPCKLWDrm")>;
 
 def SKXWriteResGroup93 : SchedWriteRes<[SKXPort5,SKXPort015]> {
   let Latency = 7;
@@ -2711,12 +2651,6 @@ def: InstRW<[SKXWriteResGroup119], (inst
                                               "VPBROADCASTWYrm",
                                               "VPBROADCASTWZ256m(b?)",
                                               "VPBROADCASTWZm(b?)",
-                                              "VPERMILPDYrm",
-                                              "VPERMILPDZ256rm(b?)",
-                                              "VPERMILPDZrm(b?)",
-                                              "VPERMILPSYrm",
-                                              "VPERMILPSZ256rm(b?)",
-                                              "VPERMILPSZrm(b?)",
                                               "VPMOVSXBDYrm",
                                               "VPMOVSXBQYrm",
                                               "VPMOVSXWQYrm",
@@ -3367,40 +3301,6 @@ def: InstRW<[SKXWriteResGroup148], (inst
                                               "VPCMPUWZrmi(b?)",
                                               "VPCMPWZ256rmi(b?)",
                                               "VPCMPWZrmi(b?)",
-                                              "VPERM2F128rm",
-                                              "VPERM2I128rm",
-                                              "VPERMDYrm",
-                                              "VPERMDZ256rm(b?)",
-                                              "VPERMDZrm(b?)",
-                                              "VPERMI2D256rm(b?)",
-                                              "VPERMI2Drm(b?)",
-                                              "VPERMI2PD256rm(b?)",
-                                              "VPERMI2PDrm(b?)",
-                                              "VPERMI2PS256rm(b?)",
-                                              "VPERMI2PSrm(b?)",
-                                              "VPERMI2Q256rm(b?)",
-                                              "VPERMI2Qrm(b?)",
-                                              "VPERMPDYmi",
-                                              "VPERMPDZ256m(b?)i",
-                                              "VPERMPDZ256rm(b?)",
-                                              "VPERMPDZm(b?)i",
-                                              "VPERMPDZrm(b?)",
-                                              "VPERMPSYrm",
-                                              "VPERMPSZ256rm(b?)",
-                                              "VPERMPSZrm(b?)",
-                                              "VPERMQYmi",
-                                              "VPERMQZ256m(b?)i",
-                                              "VPERMQZ256rm(b?)",
-                                              "VPERMQZm(b?)i",
-                                              "VPERMQZrm(b?)",
-                                              "VPERMT2D256rm(b?)",
-                                              "VPERMT2Drm(b?)",
-                                              "VPERMT2PD256rm(b?)",
-                                              "VPERMT2PDrm(b?)",
-                                              "VPERMT2PS256rm(b?)",
-                                              "VPERMT2PSrm(b?)",
-                                              "VPERMT2Q256rm(b?)",
-                                              "VPERMT2Qrm(b?)",
                                               "VPMAXSQZ256rm(b?)",
                                               "VPMAXSQZrm(b?)",
                                               "VPMAXUQZ256rm(b?)",
@@ -3409,35 +3309,11 @@ def: InstRW<[SKXWriteResGroup148], (inst
                                               "VPMINSQZrm(b?)",
                                               "VPMINUQZ256rm(b?)",
                                               "VPMINUQZrm(b?)",
-                                              "VPMOVSXBDZ256rm(b?)",
-                                              "VPMOVSXBDZrm(b?)",
-                                              "VPMOVSXBQZ256rm(b?)",
-                                              "VPMOVSXBQZrm(b?)",
-                                              "VPMOVSXBWZ256rm(b?)",
-                                              "VPMOVSXBWZrm(b?)",
-                                              "VPMOVSXDQZ256rm(b?)",
-                                              "VPMOVSXDQZrm(b?)",
-                                              "VPMOVSXWDZ256rm(b?)",
-                                              "VPMOVSXWDZrm(b?)",
-                                              "VPMOVSXWQZ256rm(b?)",
-                                              "VPMOVSXWQZrm(b?)",
                                               "VPMOVZXBDYrm",
-                                              "VPMOVZXBDZ256rm(b?)",
-                                              "VPMOVZXBDZrm(b?)",
                                               "VPMOVZXBQYrm",
-                                              "VPMOVZXBQZ256rm(b?)",
-                                              "VPMOVZXBQZrm(b?)",
                                               "VPMOVZXBWYrm",
-                                              "VPMOVZXBWZ256rm(b?)",
-                                              "VPMOVZXBWZrm(b?)",
                                               "VPMOVZXDQYrm",
-                                              "VPMOVZXDQZ256rm(b?)",
-                                              "VPMOVZXDQZrm(b?)",
-                                              "VPMOVZXWDZ256rm(b?)",
-                                              "VPMOVZXWDZrm(b?)",
                                               "VPMOVZXWQYrm",
-                                              "VPMOVZXWQZ256rm(b?)",
-                                              "VPMOVZXWQZrm(b?)",
                                               "VPSADBWYrm",
                                               "VPSADBWZ256rm(b?)",
                                               "VPSADBWZrm(b?)",
@@ -3456,15 +3332,7 @@ def: InstRW<[SKXWriteResGroup148], (inst
                                               "VPTESTNMQZ256rm(b?)",
                                               "VPTESTNMQZrm(b?)",
                                               "VPTESTNMWZ256rm(b?)",
-                                              "VPTESTNMWZrm(b?)",
-                                              "VSHUFF32X4Z256rm(b?)i",
-                                              "VSHUFF32X4Zrm(b?)i",
-                                              "VSHUFF64X2Z256rm(b?)i",
-                                              "VSHUFF64X2Zrm(b?)i",
-                                              "VSHUFI32X4Z256rm(b?)i",
-                                              "VSHUFI32X4Zrm(b?)i",
-                                              "VSHUFI64X2Z256rm(b?)i",
-                                              "VSHUFI64X2Zrm(b?)i")>;
+                                              "VPTESTNMWZrm(b?)")>;
 
 def SKXWriteResGroup149 : SchedWriteRes<[SKXPort23,SKXPort015]> {
   let Latency = 10;

Modified: llvm/trunk/test/CodeGen/X86/avx2-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-schedule.ll?rev=331380&r1=331379&r2=331380&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-schedule.ll Wed May  2 10:58:50 2018
@@ -523,7 +523,7 @@ define <8 x i32> @test_inserti128(<8 x i
 ; GENERIC-LABEL: test_inserti128:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
-; GENERIC-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2506,7 +2506,7 @@ define <4 x i64> @test_perm2i128(<4 x i6
 ; GENERIC-LABEL: test_perm2i128:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2555,7 +2555,7 @@ define <8 x i32> @test_permd(<8 x i32> %
 ; GENERIC-LABEL: test_permd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [1:1.00]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2605,7 +2605,7 @@ define <4 x double> @test_permpd(<4 x do
 ; GENERIC-LABEL: test_permpd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2654,7 +2654,7 @@ define <8 x float> @test_permps(<8 x i32
 ; GENERIC-LABEL: test_permps:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [1:1.00]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2704,7 +2704,7 @@ define <4 x i64> @test_permq(<4 x i64> %
 ; GENERIC-LABEL: test_permq:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;

Modified: llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-schedule.ll?rev=331380&r1=331379&r2=331380&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-schedule.ll Wed May  2 10:58:50 2018
@@ -2957,7 +2957,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(<8
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x8mem_to_8x16:
@@ -2977,7 +2977,7 @@ define <8 x i16> @sext_8x8mem_to_8x16(<8
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxbw (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxbw (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x8mem_to_8x16:
@@ -2998,7 +2998,7 @@ define <16 x i16> @zext_16x8mem_to_16x16
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_16x8mem_to_16x16:
@@ -3018,7 +3018,7 @@ define <16 x i16> @sext_16x8mem_to_16x16
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxbw (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxbw (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_16x8mem_to_16x16:
@@ -3104,7 +3104,7 @@ define <32 x i16> @zext_32x8mem_to_32x16
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_32x8mem_to_32x16:
@@ -3124,7 +3124,7 @@ define <32 x i16> @sext_32x8mem_to_32x16
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxbw (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxbw (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_32x8mem_to_32x16:
@@ -3210,7 +3210,7 @@ define <4 x i32> @zext_4x8mem_to_4x32(<4
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_4x8mem_to_4x32:
@@ -3230,7 +3230,7 @@ define <4 x i32> @sext_4x8mem_to_4x32(<4
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_4x8mem_to_4x32:
@@ -3250,7 +3250,7 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x8mem_to_8x32:
@@ -3270,7 +3270,7 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxbd (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxbd (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x8mem_to_8x32:
@@ -3290,7 +3290,7 @@ define <16 x i32> @zext_16x8mem_to_16x32
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_16x8mem_to_16x32:
@@ -3310,7 +3310,7 @@ define <16 x i32> @sext_16x8mem_to_16x32
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxbd (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxbd (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_16x8mem_to_16x32:
@@ -3396,7 +3396,7 @@ define <2 x i64> @zext_2x8mem_to_2x64(<2
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovq2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_2x8mem_to_2x64:
@@ -3415,7 +3415,7 @@ define <2 x i64> @sext_2x8mem_to_2x64mas
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovq2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_2x8mem_to_2x64mask:
@@ -3449,7 +3449,7 @@ define <4 x i64> @zext_4x8mem_to_4x64(<4
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_4x8mem_to_4x64:
@@ -3469,7 +3469,7 @@ define <4 x i64> @sext_4x8mem_to_4x64mas
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_4x8mem_to_4x64mask:
@@ -3504,7 +3504,7 @@ define <8 x i64> @zext_8x8mem_to_8x64(<8
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x8mem_to_8x64:
@@ -3524,7 +3524,7 @@ define <8 x i64> @sext_8x8mem_to_8x64mas
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxbq (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxbq (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x8mem_to_8x64mask:
@@ -3542,7 +3542,7 @@ define <8 x i64> @sext_8x8mem_to_8x64mas
 define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone {
 ; GENERIC-LABEL: sext_8x8mem_to_8x64:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpmovsxbq (%rdi), %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxbq (%rdi), %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x8mem_to_8x64:
@@ -3559,7 +3559,7 @@ define <4 x i32> @zext_4x16mem_to_4x32(<
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_4x16mem_to_4x32:
@@ -3579,7 +3579,7 @@ define <4 x i32> @sext_4x16mem_to_4x32ma
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_4x16mem_to_4x32mask:
@@ -3615,7 +3615,7 @@ define <8 x i32> @zext_8x16mem_to_8x32(<
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x16mem_to_8x32:
@@ -3635,7 +3635,7 @@ define <8 x i32> @sext_8x16mem_to_8x32ma
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxwd (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxwd (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x16mem_to_8x32mask:
@@ -3703,7 +3703,7 @@ define <16 x i32> @zext_16x16mem_to_16x3
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_16x16mem_to_16x32:
@@ -3723,7 +3723,7 @@ define <16 x i32> @sext_16x16mem_to_16x3
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovb2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxwd (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxwd (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_16x16mem_to_16x32mask:
@@ -3741,7 +3741,7 @@ define <16 x i32> @sext_16x16mem_to_16x3
 define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone {
 ; GENERIC-LABEL: sext_16x16mem_to_16x32:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpmovsxwd (%rdi), %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxwd (%rdi), %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_16x16mem_to_16x32:
@@ -3790,7 +3790,7 @@ define <2 x i64> @zext_2x16mem_to_2x64(<
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovq2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_2x16mem_to_2x64:
@@ -3810,7 +3810,7 @@ define <2 x i64> @sext_2x16mem_to_2x64ma
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovq2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_2x16mem_to_2x64mask:
@@ -3845,7 +3845,7 @@ define <4 x i64> @zext_4x16mem_to_4x64(<
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_4x16mem_to_4x64:
@@ -3865,7 +3865,7 @@ define <4 x i64> @sext_4x16mem_to_4x64ma
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_4x16mem_to_4x64mask:
@@ -3900,7 +3900,7 @@ define <8 x i64> @zext_8x16mem_to_8x64(<
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x16mem_to_8x64:
@@ -3920,7 +3920,7 @@ define <8 x i64> @sext_8x16mem_to_8x64ma
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxwq (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxwq (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x16mem_to_8x64mask:
@@ -3938,7 +3938,7 @@ define <8 x i64> @sext_8x16mem_to_8x64ma
 define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone {
 ; GENERIC-LABEL: sext_8x16mem_to_8x64:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpmovsxwq (%rdi), %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxwq (%rdi), %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x16mem_to_8x64:
@@ -3988,7 +3988,7 @@ define <2 x i64> @zext_2x32mem_to_2x64(<
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovq2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_2x32mem_to_2x64:
@@ -4008,7 +4008,7 @@ define <2 x i64> @sext_2x32mem_to_2x64ma
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovq2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_2x32mem_to_2x64mask:
@@ -4043,7 +4043,7 @@ define <4 x i64> @zext_4x32mem_to_4x64(<
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_4x32mem_to_4x64:
@@ -4063,7 +4063,7 @@ define <4 x i64> @sext_4x32mem_to_4x64ma
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_4x32mem_to_4x64mask:
@@ -4131,7 +4131,7 @@ define <8 x i64> @zext_8x32mem_to_8x64(<
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00]
+; GENERIC-NEXT:    vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x32mem_to_8x64:
@@ -4151,7 +4151,7 @@ define <8 x i64> @sext_8x32mem_to_8x64ma
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovw2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovsxdq (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxdq (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x32mem_to_8x64mask:
@@ -4169,7 +4169,7 @@ define <8 x i64> @sext_8x32mem_to_8x64ma
 define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone {
 ; GENERIC-LABEL: sext_8x32mem_to_8x64:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpmovsxdq (%rdi), %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxdq (%rdi), %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x32mem_to_8x64:
@@ -4473,7 +4473,7 @@ define <8 x i64> @sext_8i1_8i64(<8 x i32
 define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
 ; GENERIC-LABEL: extload_v8i64:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpmovsxbq (%rdi), %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpmovsxbq (%rdi), %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -8258,7 +8258,7 @@ define   <16 x float> @_ss16xfloat_maskz
 define   <16 x float> @_ss16xfloat_load(float* %a.ptr) {
 ; GENERIC-LABEL: _ss16xfloat_load:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vbroadcastss (%rdi), %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vbroadcastss (%rdi), %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _ss16xfloat_load:
@@ -8275,7 +8275,7 @@ define   <16 x float> @_ss16xfloat_mask_
 ; GENERIC-LABEL: _ss16xfloat_mask_load:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _ss16xfloat_mask_load:
@@ -8295,7 +8295,7 @@ define   <16 x float> @_ss16xfloat_maskz
 ; GENERIC-LABEL: _ss16xfloat_maskz_load:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _ss16xfloat_maskz_load:
@@ -8369,7 +8369,7 @@ define   <8 x double> @_sd8xdouble_maskz
 define   <8 x double> @_sd8xdouble_load(double* %a.ptr) {
 ; GENERIC-LABEL: _sd8xdouble_load:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vbroadcastsd (%rdi), %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vbroadcastsd (%rdi), %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _sd8xdouble_load:
@@ -8386,7 +8386,7 @@ define   <8 x double> @_sd8xdouble_mask_
 ; GENERIC-LABEL: _sd8xdouble_mask_load:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _sd8xdouble_mask_load:
@@ -8406,7 +8406,7 @@ define   <8 x double> @_sd8xdouble_maskz
 ; GENERIC-LABEL: _sd8xdouble_maskz_load:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _sd8xdouble_maskz_load:
@@ -8700,7 +8700,7 @@ define <16 x float> @broadcast_ss_spill(
 ; GENERIC-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00]
 ; GENERIC-NEXT:    callq func_f32
-; GENERIC-NEXT:    vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [6:1.00]
+; GENERIC-NEXT:    vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:1.00]
 ; GENERIC-NEXT:    addq $24, %rsp # sched: [1:0.33]
 ; GENERIC-NEXT:    .cfi_def_cfa_offset 8
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -8732,7 +8732,7 @@ define <8 x double> @broadcast_sd_spill(
 ; GENERIC-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00]
 ; GENERIC-NEXT:    callq func_f64
-; GENERIC-NEXT:    vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [6:1.00]
+; GENERIC-NEXT:    vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:1.00]
 ; GENERIC-NEXT:    addq $24, %rsp # sched: [1:0.33]
 ; GENERIC-NEXT:    .cfi_def_cfa_offset 8
 ; GENERIC-NEXT:    retq # sched: [1:1.00]

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll?rev=331380&r1=331379&r2=331380&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll Wed May  2 10:58:50 2018
@@ -202,7 +202,7 @@ define <16 x i16> @test_16xi16_perm_mem_
 ; GENERIC-LABEL: test_16xi16_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi16_perm_mem_mask0:
@@ -219,7 +219,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask0:
@@ -240,7 +240,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0:
@@ -261,7 +261,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask1:
@@ -282,7 +282,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1:
@@ -303,7 +303,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask2:
@@ -324,7 +324,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2:
@@ -344,7 +344,7 @@ define <16 x i16> @test_16xi16_perm_mem_
 ; GENERIC-LABEL: test_16xi16_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi16_perm_mem_mask3:
@@ -361,7 +361,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask3:
@@ -382,7 +382,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3:
@@ -596,7 +596,7 @@ define <32 x i16> @test_32xi16_perm_mem_
 ; GENERIC-LABEL: test_32xi16_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_32xi16_perm_mem_mask0:
@@ -613,7 +613,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask0:
@@ -634,7 +634,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0:
@@ -655,7 +655,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask1:
@@ -676,7 +676,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1:
@@ -697,7 +697,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask2:
@@ -718,7 +718,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2:
@@ -738,7 +738,7 @@ define <32 x i16> @test_32xi16_perm_mem_
 ; GENERIC-LABEL: test_32xi16_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [6:0.50]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_32xi16_perm_mem_mask3:
@@ -755,7 +755,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask3:
@@ -776,7 +776,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3:
@@ -990,7 +990,7 @@ define <8 x i32> @test_8xi32_perm_mem_ma
 ; GENERIC-LABEL: test_8xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_perm_mem_mask0:
@@ -1007,7 +1007,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask0:
@@ -1028,7 +1028,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0:
@@ -1049,7 +1049,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask1:
@@ -1070,7 +1070,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1:
@@ -1091,7 +1091,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask2:
@@ -1112,7 +1112,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2:
@@ -1132,7 +1132,7 @@ define <8 x i32> @test_8xi32_perm_mem_ma
 ; GENERIC-LABEL: test_8xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_perm_mem_mask3:
@@ -1149,7 +1149,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask3:
@@ -1170,7 +1170,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3:
@@ -1384,7 +1384,7 @@ define <16 x i32> @test_16xi32_perm_mem_
 ; GENERIC-LABEL: test_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_perm_mem_mask0:
@@ -1401,7 +1401,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask0:
@@ -1422,7 +1422,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0:
@@ -1443,7 +1443,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask1:
@@ -1464,7 +1464,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1:
@@ -1485,7 +1485,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask2:
@@ -1506,7 +1506,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2:
@@ -1526,7 +1526,7 @@ define <16 x i32> @test_16xi32_perm_mem_
 ; GENERIC-LABEL: test_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_perm_mem_mask3:
@@ -1543,7 +1543,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask3:
@@ -1564,7 +1564,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3:
@@ -1757,7 +1757,7 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
 ; GENERIC-LABEL: test_4xi64_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_perm_mem_mask0:
@@ -1772,7 +1772,7 @@ define <4 x i64> @test_masked_4xi64_perm
 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask0:
@@ -1791,7 +1791,7 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0:
@@ -1810,7 +1810,7 @@ define <4 x i64> @test_masked_4xi64_perm
 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask1:
@@ -1829,7 +1829,7 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1:
@@ -1848,7 +1848,7 @@ define <4 x i64> @test_masked_4xi64_perm
 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask2:
@@ -1867,7 +1867,7 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2:
@@ -1885,7 +1885,7 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
 ; GENERIC-LABEL: test_4xi64_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_perm_mem_mask3:
@@ -1900,7 +1900,7 @@ define <4 x i64> @test_masked_4xi64_perm
 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask3:
@@ -1919,7 +1919,7 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3:
@@ -2293,7 +2293,7 @@ define <8 x i64> @test_8xi64_perm_mem_ma
 ; GENERIC-LABEL: test_8xi64_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [6:0.50]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_perm_mem_mask0:
@@ -2310,7 +2310,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask0:
@@ -2331,7 +2331,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0:
@@ -2351,7 +2351,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
@@ -2370,7 +2370,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
@@ -2390,7 +2390,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask2:
@@ -2411,7 +2411,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2:
@@ -2430,7 +2430,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
 ; GENERIC-LABEL: test_8xi64_perm_imm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_perm_imm_mem_mask3:
@@ -2445,7 +2445,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
@@ -2464,7 +2464,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
@@ -2484,7 +2484,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask4:
@@ -2505,7 +2505,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4:
@@ -2525,7 +2525,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
@@ -2544,7 +2544,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
@@ -2563,7 +2563,7 @@ define <8 x i64> @test_8xi64_perm_mem_ma
 ; GENERIC-LABEL: test_8xi64_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [6:0.50]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_perm_mem_mask6:
@@ -2580,7 +2580,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask6:
@@ -2601,7 +2601,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6:
@@ -2621,7 +2621,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
@@ -2640,7 +2640,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [6:1.00]
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
@@ -2853,7 +2853,7 @@ define <8 x float> @test_8xfloat_perm_me
 ; GENERIC-LABEL: test_8xfloat_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_perm_mem_mask0:
@@ -2870,7 +2870,7 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0:
@@ -2891,7 +2891,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
@@ -2912,7 +2912,7 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1:
@@ -2933,7 +2933,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
@@ -2954,7 +2954,7 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2:
@@ -2975,7 +2975,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
@@ -2995,7 +2995,7 @@ define <8 x float> @test_8xfloat_perm_me
 ; GENERIC-LABEL: test_8xfloat_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_perm_mem_mask3:
@@ -3012,7 +3012,7 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3:
@@ -3033,7 +3033,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
@@ -3247,7 +3247,7 @@ define <16 x float> @test_16xfloat_perm_
 ; GENERIC-LABEL: test_16xfloat_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_perm_mem_mask0:
@@ -3264,7 +3264,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0:
@@ -3285,7 +3285,7 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
@@ -3306,7 +3306,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1:
@@ -3327,7 +3327,7 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
@@ -3348,7 +3348,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2:
@@ -3369,7 +3369,7 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
@@ -3389,7 +3389,7 @@ define <16 x float> @test_16xfloat_perm_
 ; GENERIC-LABEL: test_16xfloat_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_perm_mem_mask3:
@@ -3406,7 +3406,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3:
@@ -3427,7 +3427,7 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
@@ -3620,7 +3620,7 @@ define <4 x double> @test_masked_z_4xdou
 define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
 ; GENERIC-LABEL: test_4xdouble_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_perm_mem_mask0:
@@ -3635,7 +3635,7 @@ define <4 x double> @test_masked_4xdoubl
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0:
@@ -3654,7 +3654,7 @@ define <4 x double> @test_masked_z_4xdou
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
@@ -3673,7 +3673,7 @@ define <4 x double> @test_masked_4xdoubl
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1:
@@ -3692,7 +3692,7 @@ define <4 x double> @test_masked_z_4xdou
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
@@ -3711,7 +3711,7 @@ define <4 x double> @test_masked_4xdoubl
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2:
@@ -3730,7 +3730,7 @@ define <4 x double> @test_masked_z_4xdou
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
@@ -3748,7 +3748,7 @@ define <4 x double> @test_masked_z_4xdou
 define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
 ; GENERIC-LABEL: test_4xdouble_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_perm_mem_mask3:
@@ -3763,7 +3763,7 @@ define <4 x double> @test_masked_4xdoubl
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3:
@@ -3782,7 +3782,7 @@ define <4 x double> @test_masked_z_4xdou
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
@@ -4156,7 +4156,7 @@ define <8 x double> @test_8xdouble_perm_
 ; GENERIC-LABEL: test_8xdouble_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [6:0.50]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_perm_mem_mask0:
@@ -4173,7 +4173,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0:
@@ -4194,7 +4194,7 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
@@ -4214,7 +4214,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
@@ -4233,7 +4233,7 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
@@ -4253,7 +4253,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2:
@@ -4274,7 +4274,7 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
@@ -4293,7 +4293,7 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
 ; GENERIC-LABEL: test_8xdouble_perm_imm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_perm_imm_mem_mask3:
@@ -4308,7 +4308,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
@@ -4327,7 +4327,7 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
@@ -4347,7 +4347,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4:
@@ -4368,7 +4368,7 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
@@ -4388,7 +4388,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
@@ -4407,7 +4407,7 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
@@ -4426,7 +4426,7 @@ define <8 x double> @test_8xdouble_perm_
 ; GENERIC-LABEL: test_8xdouble_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [6:0.50]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_perm_mem_mask6:
@@ -4443,7 +4443,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6:
@@ -4464,7 +4464,7 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
@@ -4484,7 +4484,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
@@ -4503,7 +4503,7 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
@@ -8941,7 +8941,7 @@ define <8 x float> @test_8xfloat_masked_
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -8962,7 +8962,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
@@ -8981,7 +8981,7 @@ define <8 x float> @test_8xfloat_masked_
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9002,7 +9002,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
@@ -9021,7 +9021,7 @@ define <8 x float> @test_8xfloat_masked_
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9042,7 +9042,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
@@ -9075,7 +9075,7 @@ define <8 x float> @test_8xfloat_masked_
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9096,7 +9096,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
@@ -9288,7 +9288,7 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
 ; GENERIC-LABEL: test_16xfloat_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_shuff_mem_mask0:
@@ -9303,7 +9303,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9324,7 +9324,7 @@ define <16 x float> @test_16xfloat_zero_
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
@@ -9343,7 +9343,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9364,7 +9364,7 @@ define <16 x float> @test_16xfloat_zero_
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
@@ -9383,7 +9383,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9404,7 +9404,7 @@ define <16 x float> @test_16xfloat_zero_
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
@@ -9422,7 +9422,7 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
 ; GENERIC-LABEL: test_16xfloat_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_shuff_mem_mask3:
@@ -9437,7 +9437,7 @@ define <16 x float> @test_16xfloat_maske
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9458,7 +9458,7 @@ define <16 x float> @test_16xfloat_zero_
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
@@ -9665,7 +9665,7 @@ define <4 x double> @test_4xdouble_maske
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9686,7 +9686,7 @@ define <4 x double> @test_4xdouble_zero_
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
@@ -9705,7 +9705,7 @@ define <4 x double> @test_4xdouble_maske
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9726,7 +9726,7 @@ define <4 x double> @test_4xdouble_zero_
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
@@ -9745,7 +9745,7 @@ define <4 x double> @test_4xdouble_maske
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9766,7 +9766,7 @@ define <4 x double> @test_4xdouble_zero_
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
@@ -9799,7 +9799,7 @@ define <4 x double> @test_4xdouble_maske
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9820,7 +9820,7 @@ define <4 x double> @test_4xdouble_zero_
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
@@ -10012,7 +10012,7 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
 ; GENERIC-LABEL: test_8xdouble_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_shuff_mem_mask0:
@@ -10027,7 +10027,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10048,7 +10048,7 @@ define <8 x double> @test_8xdouble_zero_
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
@@ -10067,7 +10067,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10088,7 +10088,7 @@ define <8 x double> @test_8xdouble_zero_
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
@@ -10107,7 +10107,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10128,7 +10128,7 @@ define <8 x double> @test_8xdouble_zero_
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
@@ -10146,7 +10146,7 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
 ; GENERIC-LABEL: test_8xdouble_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_shuff_mem_mask3:
@@ -10161,7 +10161,7 @@ define <8 x double> @test_8xdouble_maske
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10182,7 +10182,7 @@ define <8 x double> @test_8xdouble_zero_
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
@@ -10374,7 +10374,7 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_shuff_mem_mask0:
@@ -10389,7 +10389,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10410,7 +10410,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
@@ -10429,7 +10429,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10450,7 +10450,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
@@ -10469,7 +10469,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10490,7 +10490,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
@@ -10508,7 +10508,7 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_shuff_mem_mask3:
@@ -10523,7 +10523,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10544,7 +10544,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
@@ -10736,7 +10736,7 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
 ; GENERIC-LABEL: test_16xi32_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_shuff_mem_mask0:
@@ -10751,7 +10751,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10772,7 +10772,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
@@ -10791,7 +10791,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10812,7 +10812,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
@@ -10831,7 +10831,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10852,7 +10852,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
@@ -10870,7 +10870,7 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
 ; GENERIC-LABEL: test_16xi32_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_shuff_mem_mask3:
@@ -10885,7 +10885,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10906,7 +10906,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
@@ -11098,7 +11098,7 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
 ; GENERIC-LABEL: test_4xi64_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_shuff_mem_mask0:
@@ -11113,7 +11113,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11134,7 +11134,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
@@ -11153,7 +11153,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11174,7 +11174,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
@@ -11193,7 +11193,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11214,7 +11214,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
@@ -11232,7 +11232,7 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
 ; GENERIC-LABEL: test_4xi64_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_shuff_mem_mask3:
@@ -11247,7 +11247,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11268,7 +11268,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
@@ -11460,7 +11460,7 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
 ; GENERIC-LABEL: test_8xi64_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_shuff_mem_mask0:
@@ -11475,7 +11475,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11496,7 +11496,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
@@ -11515,7 +11515,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11536,7 +11536,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
@@ -11555,7 +11555,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11576,7 +11576,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
@@ -11594,7 +11594,7 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
 ; GENERIC-LABEL: test_8xi64_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_shuff_mem_mask3:
@@ -11609,7 +11609,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11630,7 +11630,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [6:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
@@ -11837,7 +11837,7 @@ define <4 x float> @test_4xfloat_masked_
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11858,7 +11858,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
@@ -11877,7 +11877,7 @@ define <4 x float> @test_4xfloat_masked_
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11898,7 +11898,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
@@ -11917,7 +11917,7 @@ define <4 x float> @test_4xfloat_masked_
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11938,7 +11938,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
@@ -11971,7 +11971,7 @@ define <4 x float> @test_4xfloat_masked_
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11992,7 +11992,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
@@ -12836,7 +12836,7 @@ define <2 x double> @test_2xdouble_maske
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12857,7 +12857,7 @@ define <2 x double> @test_2xdouble_zero_
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
@@ -12876,7 +12876,7 @@ define <2 x double> @test_2xdouble_maske
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12897,7 +12897,7 @@ define <2 x double> @test_2xdouble_zero_
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [6:1.00]
+; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
@@ -13828,7 +13828,7 @@ define <4 x float> @test_4xfloat_masked_
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -13849,7 +13849,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
@@ -13868,7 +13868,7 @@ define <4 x float> @test_4xfloat_masked_
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -13889,7 +13889,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
@@ -13908,7 +13908,7 @@ define <4 x float> @test_4xfloat_masked_
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -13929,7 +13929,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
@@ -13962,7 +13962,7 @@ define <4 x float> @test_4xfloat_masked_
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -13983,7 +13983,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
@@ -14827,7 +14827,7 @@ define <2 x double> @test_2xdouble_maske
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -14848,7 +14848,7 @@ define <2 x double> @test_2xdouble_zero_
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
@@ -14867,7 +14867,7 @@ define <2 x double> @test_2xdouble_maske
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -14888,7 +14888,7 @@ define <2 x double> @test_2xdouble_zero_
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [6:1.00]
+; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:

Modified: llvm/trunk/test/CodeGen/X86/xop-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/xop-schedule.ll?rev=331380&r1=331379&r2=331380&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/xop-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/xop-schedule.ll Wed May  2 10:58:50 2018
@@ -212,8 +212,8 @@ define void @test_vpermil2pd_128(<2 x do
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    #APP
 ; GENERIC-NEXT:    vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; GENERIC-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -234,8 +234,8 @@ define void @test_vpermil2pd_256(<4 x do
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    #APP
 ; GENERIC-NEXT:    vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; GENERIC-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -258,8 +258,8 @@ define void @test_vpermil2ps_128(<4 x fl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    #APP
 ; GENERIC-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; GENERIC-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -280,8 +280,8 @@ define void @test_vpermil2ps_256(<8 x fl
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    #APP
 ; GENERIC-NEXT:    vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; GENERIC-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]




More information about the llvm-commits mailing list