[llvm] r331386 - [X86] Split WriteShuffle/WriteVarShuffle + WriteBlend/WriteVarBlend into XMM and YMM/ZMM scheduler classes

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed May 2 11:48:24 PDT 2018


Author: rksimon
Date: Wed May  2 11:48:23 2018
New Revision: 331386

URL: http://llvm.org/viewvc/llvm-project?rev=331386&view=rev
Log:
[X86] Split WriteShuffle/WriteVarShuffle + WriteBlend/WriteVarBlend into XMM and YMM/ZMM scheduler classes

Modified:
    llvm/trunk/lib/Target/X86/X86SchedBroadwell.td
    llvm/trunk/lib/Target/X86/X86SchedHaswell.td
    llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td
    llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td
    llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td
    llvm/trunk/lib/Target/X86/X86Schedule.td
    llvm/trunk/lib/Target/X86/X86ScheduleAtom.td
    llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td
    llvm/trunk/lib/Target/X86/X86ScheduleSLM.td
    llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td
    llvm/trunk/test/CodeGen/X86/avx2-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
    llvm/trunk/test/CodeGen/X86/xop-schedule.ll

Modified: llvm/trunk/lib/Target/X86/X86SchedBroadwell.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedBroadwell.td?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedBroadwell.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedBroadwell.td Wed May  2 11:48:23 2018
@@ -206,9 +206,13 @@ defm : BWWriteResPair<WriteVecShift, [BW
 defm : BWWriteResPair<WriteVecIMul,  [BWPort0],   5>; // Vector integer multiply.
 defm : BWWriteResPair<WritePMULLD,   [BWPort0], 10, [2], 2, 5>; // PMULLD
 defm : BWWriteResPair<WriteShuffle,  [BWPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : BWWriteResPair<WriteShuffleY, [BWPort5], 1, [1], 1, 6>; // Vector shuffles (YMM/ZMM).
 defm : BWWriteResPair<WriteVarShuffle, [BWPort5], 1, [1], 1, 5>; // Vector variable shuffles.
-defm : BWWriteResPair<WriteBlend,  [BWPort5],  1>; // Vector blends.
+defm : BWWriteResPair<WriteVarShuffleY,[BWPort5], 1, [1], 1, 6>; // Vector variable shuffles (YMM/ZMM).
+defm : BWWriteResPair<WriteBlend,  [BWPort5], 1, [1], 1, 5>; // Vector blends.
+defm : BWWriteResPair<WriteBlendY, [BWPort5], 1, [1], 1, 6>; // Vector blends (YMM/ZMM).
 defm : BWWriteResPair<WriteVarBlend,  [BWPort5], 2, [2], 2, 5>; // Vector variable blends.
+defm : BWWriteResPair<WriteVarBlendY, [BWPort5], 2, [2], 2, 6>; // Vector variable blends (YMM/ZMM).
 defm : BWWriteResPair<WriteMPSAD,  [BWPort0, BWPort5], 7, [1, 2], 3, 5>; // Vector MPSAD.
 defm : BWWriteResPair<WritePSADBW,   [BWPort0],   5>; // Vector PSADBW.
 defm : BWWriteResPair<WritePHMINPOS, [BWPort0],   5>; // Vector PHMINPOS.
@@ -1079,30 +1083,6 @@ def: InstRW<[BWWriteResGroup74], (instre
                                             "FCOMP32m",
                                             "FCOMP64m")>;
 
-def BWWriteResGroup75 : SchedWriteRes<[BWPort5,BWPort23]> {
-  let Latency = 7;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1,1];
-}
-def: InstRW<[BWWriteResGroup75], (instregex "VPACKSSDWYrm",
-                                            "VPACKSSWBYrm",
-                                            "VPACKUSDWYrm",
-                                            "VPACKUSWBYrm",
-                                            "VPALIGNRYrmi",
-                                            "VPBLENDWYrmi",
-                                            "VPSHUFBYrm",
-                                            "VPSHUFDYmi",
-                                            "VPSHUFHWYmi",
-                                            "VPSHUFLWYmi",
-                                            "VPUNPCKHBWYrm",
-                                            "VPUNPCKHDQYrm",
-                                            "VPUNPCKHQDQYrm",
-                                            "VPUNPCKHWDYrm",
-                                            "VPUNPCKLBWYrm",
-                                            "VPUNPCKLDQYrm",
-                                            "VPUNPCKLQDQYrm",
-                                            "VPUNPCKLWDYrm")>;
-
 def BWWriteResGroup76 : SchedWriteRes<[BWPort23,BWPort15]> {
   let Latency = 7;
   let NumMicroOps = 2;
@@ -1293,7 +1273,6 @@ def BWWriteResGroup94 : SchedWriteRes<[B
 }
 def: InstRW<[BWWriteResGroup94], (instregex "VMASKMOVPDYrm",
                                             "VMASKMOVPSYrm",
-                                            "VPBLENDVBYrm",
                                             "VPMASKMOVDYrm",
                                             "VPMASKMOVQYrm")>;
 

Modified: llvm/trunk/lib/Target/X86/X86SchedHaswell.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedHaswell.td?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedHaswell.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedHaswell.td Wed May  2 11:48:23 2018
@@ -201,11 +201,15 @@ defm : HWWriteResPair<WriteVecALU,   [HW
 defm : HWWriteResPair<WriteVecIMul,  [HWPort0],   5>;
 defm : HWWriteResPair<WritePMULLD,   [HWPort0], 10, [2], 2, 6>;
 defm : HWWriteResPair<WriteShuffle,  [HWPort5],  1, [1], 1, 5>;
-defm : HWWriteResPair<WriteVarShuffle,[HWPort5], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteShuffleY, [HWPort5],  1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVarShuffle, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVarShuffleY,[HWPort5], 1, [1], 1, 7>;
 defm : HWWriteResPair<WriteBlend,  [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteBlendY, [HWPort5], 1, [1], 1, 7>;
 defm : HWWriteResPair<WriteShuffle256, [HWPort5], 3, [1], 1, 7>;
 defm : HWWriteResPair<WriteVarShuffle256, [HWPort5], 3, [1], 1, 7>;
 defm : HWWriteResPair<WriteVarBlend,  [HWPort5], 2, [2], 2, 6>;
+defm : HWWriteResPair<WriteVarBlendY, [HWPort5], 2, [2], 2, 7>;
 defm : HWWriteResPair<WriteVarVecShift,  [HWPort0, HWPort5], 2, [2, 1]>;
 defm : HWWriteResPair<WriteMPSAD,  [HWPort0, HWPort5], 7, [1, 2], 3, 6>;
 defm : HWWriteResPair<WritePSADBW, [HWPort0], 5>;
@@ -880,7 +884,6 @@ def: InstRW<[HWWriteResGroup13], (instre
                                             "(V?)PACKUSDWrm",
                                             "(V?)PACKUSWBrm",
                                             "(V?)PALIGNRrmi",
-                                            "(V?)PSHUFBrm",
                                             "(V?)PSHUFDmi",
                                             "(V?)PSHUFHWmi",
                                             "(V?)PSHUFLWmi",
@@ -898,27 +901,16 @@ def HWWriteResGroup13_1 : SchedWriteRes<
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKSSDWYrm",
-                                              "VPACKSSWBYrm",
-                                              "VPACKUSDWYrm",
-                                              "VPACKUSWBYrm",
-                                              "VPALIGNRYrmi",
-                                              "VPBLENDWYrmi",
-                                              "VPMOVSXBDYrm",
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBDYrm",
                                               "VPMOVSXBQYrm",
-                                              "VPMOVSXWQYrm",
-                                              "VPSHUFBYrm",
-                                              "VPSHUFDYmi",
-                                              "VPSHUFHWYmi",
-                                              "VPSHUFLWYmi",
-                                              "VPUNPCKHBWYrm",
-                                              "VPUNPCKHDQYrm",
-                                              "VPUNPCKHQDQYrm",
-                                              "VPUNPCKHWDYrm",
-                                              "VPUNPCKLBWYrm",
-                                              "VPUNPCKLDQYrm",
-                                              "VPUNPCKLQDQYrm",
-                                              "VPUNPCKLWDYrm")>;
+                                              "VPMOVSXWQYrm")>;
+
+def HWWriteResGroup13_2 : SchedWriteRes<[HWPort5,HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PSHUFBrm")>;
 
 def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> {
   let Latency = 6;
@@ -1244,7 +1236,6 @@ def HWWriteResGroup36_1 : SchedWriteRes<
 }
 def: InstRW<[HWWriteResGroup36_1], (instregex "VMASKMOVPDYrm",
                                               "VMASKMOVPSYrm",
-                                              "VPBLENDVBYrm",
                                               "VPMASKMOVDYrm",
                                               "VPMASKMOVQYrm")>;
 

Modified: llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td Wed May  2 11:48:23 2018
@@ -181,9 +181,13 @@ defm : SBWriteResPair<WriteVecALU,   [SB
 defm : SBWriteResPair<WriteVecIMul,  [SBPort0], 5>;
 defm : SBWriteResPair<WritePMULLD,   [SBPort0], 5, [1], 1, 6>; // TODO this is probably wrong for 256/512-bit for the "generic" model
 defm : SBWriteResPair<WriteShuffle,  [SBPort5], 1, [1], 1, 5>;
-defm : SBWriteResPair<WriteVarShuffle,  [SBPort15], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteShuffleY, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVarShuffle,  [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVarShuffleY, [SBPort15], 1, [1], 1, 7>;
 defm : SBWriteResPair<WriteBlend,   [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteBlendY,  [SBPort15], 1, [1], 1, 7>;
 defm : SBWriteResPair<WriteVarBlend, [SBPort15], 2, [2], 2, 6>;
+defm : SBWriteResPair<WriteVarBlendY,[SBPort15], 2, [2], 2, 7>;
 defm : SBWriteResPair<WriteMPSAD, [SBPort0, SBPort15], 7, [1,2], 3, 6>;
 defm : SBWriteResPair<WritePSADBW,  [SBPort0], 5>;
 defm : SBWriteResPair<WritePHMINPOS,  [SBPort0], 5, [1], 1, 6>;
@@ -884,6 +888,7 @@ def SBWriteResGroup51 : SchedWriteRes<[S
 }
 def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABS(B|D|W)rm",
                                             "MMX_PALIGNRrmi",
+                                            "MMX_PSHUFBrm",
                                             "MMX_PSIGN(B|D|W)rm")>;
 
 def SBWriteResGroup52 : SchedWriteRes<[SBPort23,SBPort015]> {
@@ -995,7 +1000,6 @@ def: InstRW<[SBWriteResGroup59], (instre
                                             "(V?)PMOVZXDQrm",
                                             "(V?)PMOVZXWDrm",
                                             "(V?)PMOVZXWQrm",
-                                            "(V?)PSHUFBrm",
                                             "(V?)PSHUFDmi",
                                             "(V?)PSHUFHWmi",
                                             "(V?)PSHUFLWmi",

Modified: llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td Wed May  2 11:48:23 2018
@@ -202,9 +202,13 @@ defm : SKLWriteResPair<WriteVecShift, [S
 defm : SKLWriteResPair<WriteVecIMul,  [SKLPort0],   5>; // Vector integer multiply.
 defm : SKLWriteResPair<WritePMULLD,   [SKLPort01], 10, [2], 2, 6>;
 defm : SKLWriteResPair<WriteShuffle,  [SKLPort5], 1, [1], 1, 5>; // Vector shuffles.
-defm : SKLWriteResPair<WriteVarShuffle,  [SKLPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : SKLWriteResPair<WriteShuffleY, [SKLPort5], 1, [1], 1, 7>; // Vector shuffles (YMM/ZMM).
+defm : SKLWriteResPair<WriteVarShuffle,  [SKLPort5], 1, [1], 1, 6>; // Vector shuffles.
+defm : SKLWriteResPair<WriteVarShuffleY, [SKLPort5], 1, [1], 1, 7>; // Vector shuffles (YMM/ZMM).
 defm : SKLWriteResPair<WriteBlend,  [SKLPort5], 1, [1], 1, 6>; // Vector blends.
+defm : SKLWriteResPair<WriteBlendY, [SKLPort5], 1, [1], 1, 7>; // Vector blends (YMM/ZMM).
 defm : SKLWriteResPair<WriteVarBlend,  [SKLPort015], 2, [2], 2, 6>; // Vector variable blends.
+defm : SKLWriteResPair<WriteVarBlendY, [SKLPort015], 2, [2], 2, 6>; // Vector variable blends (YMM/ZMM).
 defm : SKLWriteResPair<WriteMPSAD,  [SKLPort5], 4, [2], 2, 6>; // Vector MPSAD.
 defm : SKLWriteResPair<WritePSADBW, [SKLPort5], 3>; // Vector PSADBW.
 defm : SKLWriteResPair<WritePHMINPOS, [SKLPort01], 4, [1], 1, 6>; // Vector PHMINPOS.
@@ -1268,7 +1272,6 @@ def: InstRW<[SKLWriteResGroup88], (instr
                                              "(V?)PALIGNRrmi",
                                              "VPBROADCASTBrm",
                                              "VPBROADCASTWrm",
-                                             "(V?)PSHUFBrm",
                                              "(V?)PSHUFDmi",
                                              "(V?)PSHUFHWmi",
                                              "(V?)PSHUFLWmi",
@@ -1281,6 +1284,13 @@ def: InstRW<[SKLWriteResGroup88], (instr
                                              "(V?)PUNPCKLQDQrm",
                                              "(V?)PUNPCKLWDrm")>;
 
+def SKLWriteResGroup88a : SchedWriteRes<[SKLPort5,SKLPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup88a], (instregex "MMX_PSHUFBrm")>;
+
 def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort01]> {
   let Latency = 7;
   let NumMicroOps = 2;
@@ -1496,29 +1506,11 @@ def: InstRW<[SKLWriteResGroup108], (inst
                                               "FCOM64m",
                                               "FCOMP32m",
                                               "FCOMP64m",
-                                              "VPACKSSDWYrm",
-                                              "VPACKSSWBYrm",
-                                              "VPACKUSDWYrm",
-                                              "VPACKUSWBYrm",
-                                              "VPALIGNRYrmi",
-                                              "VPBLENDWYrmi",
                                               "VPBROADCASTBYrm",
                                               "VPBROADCASTWYrm",
                                               "VPMOVSXBDYrm",
                                               "VPMOVSXBQYrm",
-                                              "VPMOVSXWQYrm",
-                                              "VPSHUFBYrm",
-                                              "VPSHUFDYmi",
-                                              "VPSHUFHWYmi",
-                                              "VPSHUFLWYmi",
-                                              "VPUNPCKHBWYrm",
-                                              "VPUNPCKHDQYrm",
-                                              "VPUNPCKHQDQYrm",
-                                              "VPUNPCKHWDYrm",
-                                              "VPUNPCKLBWYrm",
-                                              "VPUNPCKLDQYrm",
-                                              "VPUNPCKLQDQYrm",
-                                              "VPUNPCKLWDYrm")>;
+                                              "VPMOVSXWQYrm")>;
 
 def SKLWriteResGroup109 : SchedWriteRes<[SKLPort01,SKLPort23]> {
   let Latency = 8;

Modified: llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td Wed May  2 11:48:23 2018
@@ -202,9 +202,13 @@ defm : SKXWriteResPair<WriteVecShift, [S
 defm : SKXWriteResPair<WriteVecIMul,  [SKXPort0],   5>; // Vector integer multiply.
 defm : SKXWriteResPair<WritePMULLD,   [SKXPort015], 10, [2], 2, 6>; // Vector integer multiply.
 defm : SKXWriteResPair<WriteShuffle,  [SKXPort5], 1, [1], 1, 5>; // Vector shuffles.
-defm : SKXWriteResPair<WriteVarShuffle,  [SKXPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : SKXWriteResPair<WriteShuffleY, [SKXPort5], 1, [1], 1, 7>; // Vector shuffles (YMM/ZMM).
+defm : SKXWriteResPair<WriteVarShuffle,  [SKXPort5], 1, [1], 1, 6>; // Vector variable shuffles.
+defm : SKXWriteResPair<WriteVarShuffleY, [SKXPort5], 1, [1], 1, 7>; // Vector variable shuffles (YMM/ZMM).
 defm : SKXWriteResPair<WriteBlend, [SKXPort5], 1, [1], 1, 6>; // Vector blends.
+defm : SKXWriteResPair<WriteBlendY,[SKXPort5], 1, [1], 1, 7>; // Vector blends (YMM/ZMM).
 defm : SKXWriteResPair<WriteVarBlend, [SKXPort015], 2, [2], 2, 6>; // Vector variable blends.
+defm : SKXWriteResPair<WriteVarBlendY,[SKXPort015], 2, [2], 2, 6>; // Vector variable blends (YMM/ZMM).
 defm : SKXWriteResPair<WriteMPSAD,  [SKXPort5], 4, [2], 2, 6>; // Vector MPSAD.
 defm : SKXWriteResPair<WritePSADBW, [SKXPort5], 3, [1], 1, 6>; // Vector PSADBW.
 defm : SKXWriteResPair<WritePHMINPOS, [SKXPort015], 4, [1], 1, 6>; // Vector PHMINPOS.
@@ -2167,8 +2171,6 @@ def: InstRW<[SKXWriteResGroup92], (instr
                                              "VPBROADCASTBrm",
                                              "VPBROADCASTWZ128m(b?)",
                                              "VPBROADCASTWrm",
-                                             "VPSHUFBZ128rm(b?)",
-                                             "(V?)PSHUFBrm",
                                              "VPSHUFDZ128m(b?)i",
                                              "(V?)PSHUFDmi",
                                              "VPSHUFHWZ128mi(b?)",
@@ -2194,6 +2196,13 @@ def: InstRW<[SKXWriteResGroup92], (instr
                                              "VPUNPCKLWDZ128rm(b?)",
                                              "(V?)PUNPCKLWDrm")>;
 
+def SKXWriteResGroup92a : SchedWriteRes<[SKXPort5,SKXPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup92a], (instregex "MMX_PSHUFBrm")>;
+
 def SKXWriteResGroup93 : SchedWriteRes<[SKXPort5,SKXPort015]> {
   let Latency = 7;
   let NumMicroOps = 2;
@@ -2629,22 +2638,6 @@ def: InstRW<[SKXWriteResGroup119], (inst
                                               "FCOMP64m",
                                               "MMX_PSADBWirm",
                                               "VFPCLASSSDrm(b?)",
-                                              "VPACKSSDWYrm",
-                                              "VPACKSSDWZ256rm(b?)",
-                                              "VPACKSSDWZrm(b?)",
-                                              "VPACKSSWBYrm",
-                                              "VPACKSSWBZ256rm(b?)",
-                                              "VPACKSSWBZrm(b?)",
-                                              "VPACKUSDWYrm",
-                                              "VPACKUSDWZ256rm(b?)",
-                                              "VPACKUSDWZrm(b?)",
-                                              "VPACKUSWBYrm",
-                                              "VPACKUSWBZ256rm(b?)",
-                                              "VPACKUSWBZrm(b?)",
-                                              "VPALIGNRYrmi",
-                                              "VPALIGNRZ256rmi(b?)",
-                                              "VPALIGNRZrmi(b?)",
-                                              "VPBLENDWYrmi",
                                               "VPBROADCASTBYrm",
                                               "VPBROADCASTBZ256m(b?)",
                                               "VPBROADCASTBZm(b?)",
@@ -2653,47 +2646,7 @@ def: InstRW<[SKXWriteResGroup119], (inst
                                               "VPBROADCASTWZm(b?)",
                                               "VPMOVSXBDYrm",
                                               "VPMOVSXBQYrm",
-                                              "VPMOVSXWQYrm",
-                                              "VPSHUFBYrm",
-                                              "VPSHUFBZ256rm(b?)",
-                                              "VPSHUFBZrm(b?)",
-                                              "VPSHUFDYmi",
-                                              "VPSHUFDZ256m(b?)i",
-                                              "VPSHUFDZm(b?)i",
-                                              "VPSHUFHWYmi",
-                                              "VPSHUFHWZ256mi(b?)",
-                                              "VPSHUFHWZmi(b?)",
-                                              "VPSHUFLWYmi",
-                                              "VPSHUFLWZ256mi(b?)",
-                                              "VPSHUFLWZmi(b?)",
-                                              "VPSLLDQZ256rm(b?)",
-                                              "VPSLLDQZrm(b?)",
-                                              "VPSRLDQZ256rm(b?)",
-                                              "VPSRLDQZrm(b?)",
-                                              "VPUNPCKHBWYrm",
-                                              "VPUNPCKHBWZ256rm(b?)",
-                                              "VPUNPCKHBWZrm(b?)",
-                                              "VPUNPCKHDQYrm",
-                                              "VPUNPCKHDQZ256rm(b?)",
-                                              "VPUNPCKHDQZrm(b?)",
-                                              "VPUNPCKHQDQYrm",
-                                              "VPUNPCKHQDQZ256rm(b?)",
-                                              "VPUNPCKHQDQZrm(b?)",
-                                              "VPUNPCKHWDYrm",
-                                              "VPUNPCKHWDZ256rm(b?)",
-                                              "VPUNPCKHWDZrm(b?)",
-                                              "VPUNPCKLBWYrm",
-                                              "VPUNPCKLBWZ256rm(b?)",
-                                              "VPUNPCKLBWZrm(b?)",
-                                              "VPUNPCKLDQYrm",
-                                              "VPUNPCKLDQZ256rm(b?)",
-                                              "VPUNPCKLDQZrm(b?)",
-                                              "VPUNPCKLQDQYrm",
-                                              "VPUNPCKLQDQZ256rm(b?)",
-                                              "VPUNPCKLQDQZrm(b?)",
-                                              "VPUNPCKLWDYrm",
-                                              "VPUNPCKLWDZ256rm(b?)",
-                                              "VPUNPCKLWDZrm(b?)")>;
+                                              "VPMOVSXWQYrm")>;
 
 def SKXWriteResGroup120 : SchedWriteRes<[SKXPort01,SKXPort23]> {
   let Latency = 8;

Modified: llvm/trunk/lib/Target/X86/X86Schedule.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Schedule.td?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Schedule.td (original)
+++ llvm/trunk/lib/Target/X86/X86Schedule.td Wed May  2 11:48:23 2018
@@ -139,9 +139,13 @@ defm WriteVecShift : X86SchedWritePair;
 defm WriteVecIMul  : X86SchedWritePair; // Vector integer multiply.
 defm WritePMULLD : X86SchedWritePair; // PMULLD
 defm WriteShuffle  : X86SchedWritePair; // Vector shuffles.
+defm WriteShuffleY : X86SchedWritePair; // Vector shuffles (YMM/ZMM).
 defm WriteVarShuffle  : X86SchedWritePair; // Vector variable shuffles.
+defm WriteVarShuffleY : X86SchedWritePair; // Vector variable shuffles (YMM/ZMM).
 defm WriteBlend  : X86SchedWritePair; // Vector blends.
+defm WriteBlendY : X86SchedWritePair; // Vector blends (YMM/ZMM).
 defm WriteVarBlend  : X86SchedWritePair; // Vector variable blends.
+defm WriteVarBlendY : X86SchedWritePair; // Vector variable blends (YMM/ZMM).
 defm WritePSADBW : X86SchedWritePair; // Vector PSADBW.
 defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
 defm WritePHMINPOS : X86SchedWritePair; // Vector PHMINPOS.
@@ -264,15 +268,15 @@ def SchedWritePSADBW
 
 def SchedWriteShuffle
  : X86SchedWriteWidths<WriteShuffle, WriteShuffle,
-                       WriteShuffle, WriteShuffle>;
+                       WriteShuffleY, WriteShuffleY>;
 def SchedWriteVarShuffle
  : X86SchedWriteWidths<WriteVarShuffle, WriteVarShuffle,
-                       WriteVarShuffle, WriteVarShuffle>;
+                       WriteVarShuffleY, WriteVarShuffleY>;
 def SchedWriteBlend
- : X86SchedWriteWidths<WriteBlend, WriteBlend, WriteBlend, WriteBlend>;
+ : X86SchedWriteWidths<WriteBlend, WriteBlend, WriteBlendY, WriteBlendY>;
 def SchedWriteVarBlend
  : X86SchedWriteWidths<WriteVarBlend, WriteVarBlend,
-                       WriteVarBlend, WriteVarBlend>;
+                       WriteVarBlendY, WriteVarBlendY>;
 
 //===----------------------------------------------------------------------===//
 // Generic Processor Scheduler Models.

Modified: llvm/trunk/lib/Target/X86/X86ScheduleAtom.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ScheduleAtom.td?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ScheduleAtom.td (original)
+++ llvm/trunk/lib/Target/X86/X86ScheduleAtom.td Wed May  2 11:48:23 2018
@@ -261,9 +261,13 @@ defm : AtomWriteResPair<WritePHMINPOS,
 defm : AtomWriteResPair<WriteMPSAD,        [AtomPort01],  [AtomPort0], 1, 1>;
 defm : AtomWriteResPair<WritePSADBW,        [AtomPort0],  [AtomPort0], 5, 5, [5], [5]>;
 defm : AtomWriteResPair<WriteShuffle,       [AtomPort0],  [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteShuffleY,      [AtomPort0],  [AtomPort0], 1, 1>;
 defm : AtomWriteResPair<WriteVarShuffle,   [AtomPort01], [AtomPort01], 4, 5, [4], [5]>;
+defm : AtomWriteResPair<WriteVarShuffleY,  [AtomPort01], [AtomPort01], 4, 5, [4], [5]>;
 defm : AtomWriteResPair<WriteBlend,         [AtomPort0],  [AtomPort0]>; // NOTE: Doesn't exist on Atom.
+defm : AtomWriteResPair<WriteBlendY,        [AtomPort0],  [AtomPort0]>; // NOTE: Doesn't exist on Atom.
 defm : AtomWriteResPair<WriteVarBlend,      [AtomPort0],  [AtomPort0]>; // NOTE: Doesn't exist on Atom.
+defm : AtomWriteResPair<WriteVarBlendY,     [AtomPort0],  [AtomPort0]>; // NOTE: Doesn't exist on Atom.
 defm : AtomWriteResPair<WriteShuffle256,    [AtomPort0],  [AtomPort0]>; // NOTE: Doesn't exist on Atom.
 defm : AtomWriteResPair<WriteVarShuffle256, [AtomPort0],  [AtomPort0]>; // NOTE: Doesn't exist on Atom.
 defm : AtomWriteResPair<WriteVarVecShift,   [AtomPort0],  [AtomPort0]>; // NOTE: Doesn't exist on Atom.

Modified: llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td (original)
+++ llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td Wed May  2 11:48:23 2018
@@ -411,9 +411,13 @@ defm : JWriteResFpuPair<WriteMPSAD,
 defm : JWriteResFpuPair<WritePSADBW,      [JFPU01, JVALU], 2>;
 defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU0,  JVALU], 2>;
 defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteShuffleY,    [JFPU01, JVALU], 1>;
 defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 4], 3>;
+defm : JWriteResFpuPair<WriteVarShuffleY, [JFPU01, JVALU], 2, [1, 4], 3>;
 defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteBlendY,      [JFPU01, JVALU], 1>;
 defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [1, 4], 3>;
+defm : JWriteResFpuPair<WriteVarBlendY,   [JFPU01, JVALU], 2, [1, 4], 3>;
 defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
 defm : JWriteResFpuPair<WriteVecLogicY,   [JFPU01, JVALU], 1>; // NOTE: Doesn't exist on Jaguar.
 defm : JWriteResFpuPair<WriteShuffle256,  [JFPU01, JVALU], 1>;

Modified: llvm/trunk/lib/Target/X86/X86ScheduleSLM.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ScheduleSLM.td?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ScheduleSLM.td (original)
+++ llvm/trunk/lib/Target/X86/X86ScheduleSLM.td Wed May  2 11:48:23 2018
@@ -171,8 +171,11 @@ defm : SLMWriteResPair<WriteVecIMul,  [S
 //defm : SLMWriteResPair<WritePMULLD,  [SLM_FPC_RSV0],   11, [11], 7>;
 defm : SLMWriteResPair<WritePMULLD,  [SLM_FPC_RSV0],   4>;
 defm : SLMWriteResPair<WriteShuffle,  [SLM_FPC_RSV0],  1>;
+defm : SLMWriteResPair<WriteShuffleY, [SLM_FPC_RSV0],  1>;
 defm : SLMWriteResPair<WriteVarShuffle,  [SLM_FPC_RSV0],  1>;
+defm : SLMWriteResPair<WriteVarShuffleY, [SLM_FPC_RSV0],  1>;
 defm : SLMWriteResPair<WriteBlend,  [SLM_FPC_RSV0],  1>;
+defm : SLMWriteResPair<WriteBlendY, [SLM_FPC_RSV0],  1>;
 defm : SLMWriteResPair<WriteMPSAD,  [SLM_FPC_RSV0],  7>;
 defm : SLMWriteResPair<WritePSADBW, [SLM_FPC_RSV0],  4>;
 defm : SLMWriteResPair<WritePHMINPOS,  [SLM_FPC_RSV0],   4>;
@@ -290,6 +293,7 @@ def : WriteRes<WriteNop, []>;
 def  : WriteRes<WriteIMulH, [SLM_FPC_RSV0]>;
 defm : SLMWriteResPair<WriteFBlendY, [SLM_FPC_RSV0],  1>;
 defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarBlendY,[SLM_FPC_RSV0], 1>;
 defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 1>;
 defm : SLMWriteResPair<WriteFVarBlendY, [SLM_FPC_RSV0], 1>;
 defm : SLMWriteResPair<WriteFShuffle256, [SLM_FPC_RSV0],  1>;

Modified: llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td (original)
+++ llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td Wed May  2 11:48:23 2018
@@ -203,6 +203,7 @@ defm : ZnWriteResFpuPair<WriteFBlendY,
 defm : ZnWriteResFpuPair<WriteFVarBlend, [ZnFPU01], 1>;
 defm : ZnWriteResFpuPair<WriteFVarBlendY,[ZnFPU01], 1>;
 defm : ZnWriteResFpuPair<WriteVarBlend,  [ZnFPU0],  1>;
+defm : ZnWriteResFpuPair<WriteVarBlendY, [ZnFPU0],  1>;
 defm : ZnWriteResFpuPair<WriteCvtI2F,    [ZnFPU3],  5>;
 defm : ZnWriteResFpuPair<WriteCvtF2F,    [ZnFPU3],  5>;
 defm : ZnWriteResFpuPair<WriteCvtF2I,    [ZnFPU3],  5>;
@@ -241,8 +242,11 @@ defm : ZnWriteResFpuPair<WriteVecALU,
 defm : ZnWriteResFpuPair<WriteVecIMul,    [ZnFPU0],  4>;
 defm : ZnWriteResFpuPair<WritePMULLD,     [ZnFPU0],  4>; // FIXME
 defm : ZnWriteResFpuPair<WriteShuffle,    [ZnFPU],   1>;
+defm : ZnWriteResFpuPair<WriteShuffleY,   [ZnFPU],   1>;
 defm : ZnWriteResFpuPair<WriteVarShuffle, [ZnFPU],   1>;
+defm : ZnWriteResFpuPair<WriteVarShuffleY,[ZnFPU],   1>;
 defm : ZnWriteResFpuPair<WriteBlend,      [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteBlendY,     [ZnFPU01], 1>;
 defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU],   2>;
 defm : ZnWriteResFpuPair<WriteVarShuffle256, [ZnFPU],   2>;
 defm : ZnWriteResFpuPair<WritePSADBW,     [ZnFPU0],  3>;

Modified: llvm/trunk/test/CodeGen/X86/avx2-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-schedule.ll?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-schedule.ll Wed May  2 11:48:23 2018
@@ -803,7 +803,7 @@ define <16 x i16> @test_packssdw(<8 x i3
 ; GENERIC-LABEL: test_packssdw:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_packssdw:
@@ -847,7 +847,7 @@ define <32 x i8> @test_packsswb(<16 x i1
 ; GENERIC-LABEL: test_packsswb:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_packsswb:
@@ -891,7 +891,7 @@ define <16 x i16> @test_packusdw(<8 x i3
 ; GENERIC-LABEL: test_packusdw:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_packusdw:
@@ -935,7 +935,7 @@ define <32 x i8> @test_packuswb(<16 x i1
 ; GENERIC-LABEL: test_packuswb:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_packuswb:
@@ -1621,7 +1621,7 @@ define <8 x i32> @test_pblendd_ymm(<8 x
 ; GENERIC-LABEL: test_pblendd_ymm:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.50]
-; GENERIC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [7:0.50]
+; GENERIC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
 ; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -1670,7 +1670,7 @@ define <32 x i8> @test_pblendvb(<32 x i8
 ; GENERIC-LABEL: test_pblendvb:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; GENERIC-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pblendvb:
@@ -1713,7 +1713,7 @@ define <16 x i16> @test_pblendw(<16 x i1
 ; GENERIC-LABEL: test_pblendw:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:0.50]
-; GENERIC-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [7:0.50]
+; GENERIC-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [8:0.50]
 ; GENERIC-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -5132,7 +5132,7 @@ define <32 x i8> @test_pshufb(<32 x i8>
 ; GENERIC-LABEL: test_pshufb:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pshufb:
@@ -5175,7 +5175,7 @@ define <8 x i32> @test_pshufd(<8 x i32>
 ; GENERIC-LABEL: test_pshufd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
 ; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -5224,7 +5224,7 @@ define <16 x i16> @test_pshufhw(<16 x i1
 ; GENERIC-LABEL: test_pshufhw:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
 ; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -5273,7 +5273,7 @@ define <16 x i16> @test_pshuflw(<16 x i1
 ; GENERIC-LABEL: test_pshuflw:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6689,7 +6689,7 @@ define <32 x i8> @test_punpckhbw(<32 x i
 ; GENERIC-LABEL: test_punpckhbw:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
-; GENERIC-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [6:1.00]
+; GENERIC-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpckhbw:
@@ -6731,7 +6731,7 @@ define <8 x i32> @test_punpckhdq(<8 x i3
 ; GENERIC-LABEL: test_punpckhdq:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00]
+; GENERIC-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -6786,7 +6786,7 @@ define <4 x i64> @test_punpckhqdq(<4 x i
 ; GENERIC-LABEL: test_punpckhqdq:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [6:1.00]
+; GENERIC-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; GENERIC-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -6835,7 +6835,7 @@ define <16 x i16> @test_punpckhwd(<16 x
 ; GENERIC-LABEL: test_punpckhwd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
-; GENERIC-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [6:1.00]
+; GENERIC-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpckhwd:
@@ -6877,7 +6877,7 @@ define <32 x i8> @test_punpcklbw(<32 x i
 ; GENERIC-LABEL: test_punpcklbw:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
-; GENERIC-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [6:1.00]
+; GENERIC-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpcklbw:
@@ -6919,7 +6919,7 @@ define <8 x i32> @test_punpckldq(<8 x i3
 ; GENERIC-LABEL: test_punpckldq:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; GENERIC-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00]
+; GENERIC-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -6974,7 +6974,7 @@ define <4 x i64> @test_punpcklqdq(<4 x i
 ; GENERIC-LABEL: test_punpcklqdq:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; GENERIC-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [6:1.00]
+; GENERIC-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; GENERIC-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -7023,7 +7023,7 @@ define <16 x i16> @test_punpcklwd(<16 x
 ; GENERIC-LABEL: test_punpcklwd:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
-; GENERIC-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [6:1.00]
+; GENERIC-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpcklwd:

Modified: llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-schedule.ll?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-schedule.ll Wed May  2 11:48:23 2018
@@ -7589,7 +7589,7 @@ define <32 x i16> @test_build_vec_v32i1(
 define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
 ; GENERIC-LABEL: test_build_vec_v64i1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_build_vec_v64i1:

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll Wed May  2 11:48:23 2018
@@ -4535,7 +4535,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -4555,7 +4555,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mask0:
@@ -4572,7 +4572,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -4592,7 +4592,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mask1:
@@ -4609,7 +4609,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -4629,7 +4629,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mask2:
@@ -4659,7 +4659,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -4679,7 +4679,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mask3:
@@ -4713,7 +4713,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask0:
@@ -4734,7 +4734,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0:
@@ -4755,7 +4755,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask1:
@@ -4776,7 +4776,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1:
@@ -4797,7 +4797,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask2:
@@ -4818,7 +4818,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2:
@@ -4855,7 +4855,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask3:
@@ -4876,7 +4876,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3:
@@ -4895,7 +4895,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
 ; GENERIC-LABEL: test_32xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_32xi8_perm_mask0:
@@ -4909,7 +4909,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -4929,7 +4929,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mask0:
@@ -4946,7 +4946,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -4966,7 +4966,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mask1:
@@ -4983,7 +4983,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -5003,7 +5003,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mask2:
@@ -5019,7 +5019,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
 ; GENERIC-LABEL: test_32xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_32xi8_perm_mask3:
@@ -5033,7 +5033,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -5053,7 +5053,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mask3:
@@ -5070,7 +5070,7 @@ define <32 x i8> @test_32xi8_perm_mem_ma
 ; GENERIC-LABEL: test_32xi8_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_32xi8_perm_mem_mask0:
@@ -5087,7 +5087,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask0:
@@ -5108,7 +5108,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0:
@@ -5129,7 +5129,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask1:
@@ -5150,7 +5150,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1:
@@ -5171,7 +5171,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask2:
@@ -5192,7 +5192,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2:
@@ -5212,7 +5212,7 @@ define <32 x i8> @test_32xi8_perm_mem_ma
 ; GENERIC-LABEL: test_32xi8_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_32xi8_perm_mem_mask3:
@@ -5229,7 +5229,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask3:
@@ -5250,7 +5250,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3:
@@ -5269,7 +5269,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
 ; GENERIC-LABEL: test_64xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_64xi8_perm_mask0:
@@ -5283,7 +5283,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -5303,7 +5303,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mask0:
@@ -5320,7 +5320,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -5340,7 +5340,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mask1:
@@ -5357,7 +5357,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -5377,7 +5377,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mask2:
@@ -5393,7 +5393,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
 ; GENERIC-LABEL: test_64xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_64xi8_perm_mask3:
@@ -5407,7 +5407,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -5427,7 +5427,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mask3:
@@ -5444,7 +5444,7 @@ define <64 x i8> @test_64xi8_perm_mem_ma
 ; GENERIC-LABEL: test_64xi8_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [6:0.50]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_64xi8_perm_mem_mask0:
@@ -5461,7 +5461,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask0:
@@ -5482,7 +5482,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0:
@@ -5503,7 +5503,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask1:
@@ -5524,7 +5524,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1:
@@ -5545,7 +5545,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask2:
@@ -5566,7 +5566,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2:
@@ -5586,7 +5586,7 @@ define <64 x i8> @test_64xi8_perm_mem_ma
 ; GENERIC-LABEL: test_64xi8_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [6:0.50]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_64xi8_perm_mem_mask3:
@@ -5603,7 +5603,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask3:
@@ -5624,7 +5624,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50]
 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3:
@@ -6659,7 +6659,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
 ; GENERIC-LABEL: test_16xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi16_perm_high_mem_mask0:
@@ -6674,7 +6674,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0:
@@ -6693,7 +6693,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
@@ -6712,7 +6712,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1:
@@ -6731,7 +6731,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
@@ -6750,7 +6750,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2:
@@ -6769,7 +6769,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
@@ -6787,7 +6787,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
 ; GENERIC-LABEL: test_16xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi16_perm_low_mem_mask3:
@@ -6802,7 +6802,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3:
@@ -6821,7 +6821,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
@@ -6840,7 +6840,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4:
@@ -6859,7 +6859,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
@@ -6878,7 +6878,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5:
@@ -6897,7 +6897,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
@@ -6915,7 +6915,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
 ; GENERIC-LABEL: test_16xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi16_perm_high_mem_mask6:
@@ -6930,7 +6930,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6:
@@ -6949,7 +6949,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
@@ -6968,7 +6968,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7:
@@ -6987,7 +6987,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
@@ -7340,7 +7340,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
 ; GENERIC-LABEL: test_32xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_32xi16_perm_high_mem_mask0:
@@ -7355,7 +7355,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0:
@@ -7374,7 +7374,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
@@ -7393,7 +7393,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1:
@@ -7412,7 +7412,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
@@ -7431,7 +7431,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2:
@@ -7450,7 +7450,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
@@ -7468,7 +7468,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
 ; GENERIC-LABEL: test_32xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_32xi16_perm_low_mem_mask3:
@@ -7483,7 +7483,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3:
@@ -7502,7 +7502,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
@@ -7521,7 +7521,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4:
@@ -7540,7 +7540,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
@@ -7558,7 +7558,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -7579,7 +7579,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -7600,7 +7600,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
 ; GENERIC-LABEL: test_32xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_32xi16_perm_high_mem_mask6:
@@ -7615,7 +7615,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6:
@@ -7634,7 +7634,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
@@ -7653,7 +7653,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7:
@@ -7672,7 +7672,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [6:1.00]
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
@@ -8233,7 +8233,7 @@ define <8 x i32> @test2_masked_8xi32_per
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0:
@@ -8252,7 +8252,7 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
@@ -8271,7 +8271,7 @@ define <8 x i32> @test2_masked_8xi32_per
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1:
@@ -8290,7 +8290,7 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
@@ -8309,7 +8309,7 @@ define <8 x i32> @test2_masked_8xi32_per
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2:
@@ -8328,7 +8328,7 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
@@ -8361,7 +8361,7 @@ define <8 x i32> @test2_masked_8xi32_per
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3:
@@ -8380,7 +8380,7 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
@@ -8587,7 +8587,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0:
@@ -8606,7 +8606,7 @@ define <16 x i32> @test2_masked_z_16xi32
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
@@ -8625,7 +8625,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1:
@@ -8644,7 +8644,7 @@ define <16 x i32> @test2_masked_z_16xi32
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
@@ -8663,7 +8663,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2:
@@ -8682,7 +8682,7 @@ define <16 x i32> @test2_masked_z_16xi32
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
@@ -8715,7 +8715,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3:
@@ -8734,7 +8734,7 @@ define <16 x i32> @test2_masked_z_16xi32
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [6:1.00]
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3:

Modified: llvm/trunk/test/CodeGen/X86/xop-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/xop-schedule.ll?rev=331386&r1=331385&r2=331386&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/xop-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/xop-schedule.ll Wed May  2 11:48:23 2018
@@ -124,8 +124,8 @@ define void @test_vpcmov_256(<4 x i64> %
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    #APP
 ; GENERIC-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0 # sched: [6:1.00]
-; GENERIC-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -844,8 +844,8 @@ define void @test_vpperm(<2 x i64> %a0,
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    #APP
 ; GENERIC-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0 # sched: [6:0.50]
-; GENERIC-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;




More information about the llvm-commits mailing list