[llvm] r320279 - [X86] Fix bad regular expressions in the scheduler models. Question marks should be outside of multicharacter parenthesized expressions

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sat Dec 9 17:24:08 PST 2017


Author: ctopper
Date: Sat Dec  9 17:24:08 2017
New Revision: 320279

URL: http://llvm.org/viewvc/llvm-project?rev=320279&view=rev
Log:
[X86] Fix bad regular expressions in the scheduler models. Question marks should be outside of multicharacter parenthesized expressions

If the question mark is inside the parentheses, it only applies to the single character preceding it.

I had to make a few additional cleanups to fix some duplicate warnings that were exposed by fixing this.
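
To illustrate the difference, here is a minimal standalone sketch. It uses Python's re module purely for the demo; the instregex patterns are matched by LLVM's own regex class, but the grouping and '?' semantics are the same for these constructs:

import re

names = ["MOVAPDrr", "MOVAPDrr_REV"]

# Broken form: the '?' sits inside the group, so it only makes the final
# 'V' optional. The group itself is still mandatory, so the plain
# "MOVAPDrr" spelling is never matched.
bad = re.compile(r"MOVAPDrr(_REV?)")

# Fixed form: the '?' follows the group, so the whole "_REV" suffix is
# optional and both spellings match.
good = re.compile(r"MOVAPDrr(_REV)?")

for name in names:
    print(name, bool(bad.fullmatch(name)), bool(good.fullmatch(name)))
# MOVAPDrr     False True
# MOVAPDrr_REV True  True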

Modified:
    llvm/trunk/lib/Target/X86/X86SchedBroadwell.td
    llvm/trunk/lib/Target/X86/X86SchedHaswell.td
    llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td
    llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td
    llvm/trunk/test/CodeGen/X86/avx-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx2-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
    llvm/trunk/test/CodeGen/X86/mul-constant-i64.ll
    llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
    llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll
    llvm/trunk/test/CodeGen/X86/sha-schedule.ll
    llvm/trunk/test/CodeGen/X86/sse-schedule.ll

Modified: llvm/trunk/lib/Target/X86/X86SchedBroadwell.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedBroadwell.td?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedBroadwell.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedBroadwell.td Sat Dec  9 17:24:08 2017
@@ -403,18 +403,18 @@ def: InstRW<[BWWriteResGroup3], (instreg
 def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "MOV64toPQIrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVAPDrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVAPSrr(_REV?)")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVAPDrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVAPSrr(_REV)?")>;
 def: InstRW<[BWWriteResGroup3], (instregex "MOVDDUPrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "MOVDI2PDIrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "MOVHLPSrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "MOVLHPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVSDrr(_REV?)")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
 def: InstRW<[BWWriteResGroup3], (instregex "MOVSHDUPrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "MOVSLDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVSSrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVUPDrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup3], (instregex "MOVUPSrr(_REV?)")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVSSrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
 def: InstRW<[BWWriteResGroup3], (instregex "ORPDrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "ORPSrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "PACKSSDWrr")>;
@@ -466,25 +466,25 @@ def: InstRW<[BWWriteResGroup3], (instreg
 def: InstRW<[BWWriteResGroup3], (instregex "VBROADCASTSSrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VINSERTPSrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPDYrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPDrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSYrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSrr(_REV?)")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPDYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPDrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSrr(_REV)?")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VMOVDDUPYrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VMOVDDUPrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VMOVDI2PDIrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VMOVHLPSrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VMOVLHPSrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSDrr(_REV?)")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VMOVSHDUPrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VMOVSLDUPYrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVSSrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDYrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSYrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSrr(_REV?)")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVSSrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VORPDYrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VORPDrr")>;
 def: InstRW<[BWWriteResGroup3], (instregex "VORPSYrr")>;
@@ -591,8 +591,8 @@ def BWWriteResGroup6 : SchedWriteRes<[BW
   let ResourceCycles = [1];
 }
 def: InstRW<[BWWriteResGroup6], (instregex "ADC(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup6], (instregex "ADC(16|32|64)rr(_REV?)")>;
-def: InstRW<[BWWriteResGroup6], (instregex "ADC8rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup6], (instregex "ADC(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup6], (instregex "ADC8rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup6], (instregex "ADCX32rr")>;
 def: InstRW<[BWWriteResGroup6], (instregex "ADCX64rr")>;
 def: InstRW<[BWWriteResGroup6], (instregex "ADOX32rr")>;
@@ -664,8 +664,8 @@ def: InstRW<[BWWriteResGroup6], (instreg
 def: InstRW<[BWWriteResGroup6], (instregex "SARX32rr")>;
 def: InstRW<[BWWriteResGroup6], (instregex "SARX64rr")>;
 def: InstRW<[BWWriteResGroup6], (instregex "SBB(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SBB(16|32|64)rr(_REV?)")>;
-def: InstRW<[BWWriteResGroup6], (instregex "SBB8rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SBB(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SBB8rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup6], (instregex "SETAEr")>;
 def: InstRW<[BWWriteResGroup6], (instregex "SETBr")>;
 def: InstRW<[BWWriteResGroup6], (instregex "SETEr")>;
@@ -881,13 +881,13 @@ def BWWriteResGroup8 : SchedWriteRes<[BW
 def: InstRW<[BWWriteResGroup8], (instregex "BLENDPDrri")>;
 def: InstRW<[BWWriteResGroup8], (instregex "BLENDPSrri")>;
 def: InstRW<[BWWriteResGroup8], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MMX_MOVQ64rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MMX_MOVQ64rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup8], (instregex "MMX_PANDNirr")>;
 def: InstRW<[BWWriteResGroup8], (instregex "MMX_PANDirr")>;
 def: InstRW<[BWWriteResGroup8], (instregex "MMX_PORirr")>;
 def: InstRW<[BWWriteResGroup8], (instregex "MMX_PXORirr")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MOVDQArr(_REV?)")>;
-def: InstRW<[BWWriteResGroup8], (instregex "MOVDQUrr(_REV?)")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MOVDQArr(_REV)?")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MOVDQUrr(_REV)?")>;
 def: InstRW<[BWWriteResGroup8], (instregex "MOVPQI2QIrr")>;
 def: InstRW<[BWWriteResGroup8], (instregex "PANDNrr")>;
 def: InstRW<[BWWriteResGroup8], (instregex "PANDrr")>;
@@ -897,10 +897,10 @@ def: InstRW<[BWWriteResGroup8], (instreg
 def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPDrri")>;
 def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPSYrri")>;
 def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPSrri")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQAYrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQArr(_REV?)")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUYrr(_REV?)")>;
-def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUrr(_REV?)")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQAYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQArr(_REV)?")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUrr(_REV)?")>;
 def: InstRW<[BWWriteResGroup8], (instregex "VMOVPQI2QIrr")>;
 def: InstRW<[BWWriteResGroup8], (instregex "VMOVZPQILo2PQIrr")>;
 def: InstRW<[BWWriteResGroup8], (instregex "VPANDNYrr")>;
@@ -920,33 +920,32 @@ def BWWriteResGroup9 : SchedWriteRes<[BW
   let ResourceCycles = [1];
 }
 def: InstRW<[BWWriteResGroup9], (instregex "ADD(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "ADD(16|32|64)rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "ADD(16|32|64)rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "ADD8i8")>;
 def: InstRW<[BWWriteResGroup9], (instregex "ADD8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "ADD8rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "ADD8rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "AND(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND(16|32|64)rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "AND(16|32|64)rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "AND8i8")>;
 def: InstRW<[BWWriteResGroup9], (instregex "AND8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "AND8rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "AND8rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "CBW")>;
 def: InstRW<[BWWriteResGroup9], (instregex "CLC")>;
 def: InstRW<[BWWriteResGroup9], (instregex "CMC")>;
 def: InstRW<[BWWriteResGroup9], (instregex "CMP(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP(16|32|64)rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "CMP(16|32|64)rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "CMP8i8")>;
 def: InstRW<[BWWriteResGroup9], (instregex "CMP8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "CMP8rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "CMP8rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "CWDE")>;
 def: InstRW<[BWWriteResGroup9], (instregex "DEC(16|32|64)r")>;
 def: InstRW<[BWWriteResGroup9], (instregex "DEC8r")>;
 def: InstRW<[BWWriteResGroup9], (instregex "INC(16|32|64)r")>;
 def: InstRW<[BWWriteResGroup9], (instregex "INC8r")>;
 def: InstRW<[BWWriteResGroup9], (instregex "LAHF")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOV(16|32|64)rr(_REV?)")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOV8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOV8ri_alt")>;
-def: InstRW<[BWWriteResGroup9], (instregex "MOV8rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "MOV(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "MOV8ri(_alt)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "MOV8rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr16")>;
 def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr32")>;
 def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr8")>;
@@ -958,10 +957,10 @@ def: InstRW<[BWWriteResGroup9], (instreg
 def: InstRW<[BWWriteResGroup9], (instregex "NOT(16|32|64)r")>;
 def: InstRW<[BWWriteResGroup9], (instregex "NOT8r")>;
 def: InstRW<[BWWriteResGroup9], (instregex "OR(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR(16|32|64)rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "OR(16|32|64)rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "OR8i8")>;
 def: InstRW<[BWWriteResGroup9], (instregex "OR8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "OR8rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "OR8rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "SAHF")>;
 def: InstRW<[BWWriteResGroup9], (instregex "SGDT64m")>;
 def: InstRW<[BWWriteResGroup9], (instregex "SIDT64m")>;
@@ -970,10 +969,10 @@ def: InstRW<[BWWriteResGroup9], (instreg
 def: InstRW<[BWWriteResGroup9], (instregex "STC")>;
 def: InstRW<[BWWriteResGroup9], (instregex "STRm")>;
 def: InstRW<[BWWriteResGroup9], (instregex "SUB(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB(16|32|64)rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SUB(16|32|64)rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "SUB8i8")>;
 def: InstRW<[BWWriteResGroup9], (instregex "SUB8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "SUB8rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SUB8rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "SYSCALL")>;
 def: InstRW<[BWWriteResGroup9], (instregex "TEST(16|32|64)rr")>;
 def: InstRW<[BWWriteResGroup9], (instregex "TEST8i8")>;
@@ -981,10 +980,10 @@ def: InstRW<[BWWriteResGroup9], (instreg
 def: InstRW<[BWWriteResGroup9], (instregex "TEST8rr")>;
 def: InstRW<[BWWriteResGroup9], (instregex "XCHG(16|32|64)rr")>;
 def: InstRW<[BWWriteResGroup9], (instregex "XOR(16|32|64)ri8")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR(16|32|64)rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "XOR(16|32|64)rr(_REV)?")>;
 def: InstRW<[BWWriteResGroup9], (instregex "XOR8i8")>;
 def: InstRW<[BWWriteResGroup9], (instregex "XOR8ri")>;
-def: InstRW<[BWWriteResGroup9], (instregex "XOR8rr(_REV?)")>;
+def: InstRW<[BWWriteResGroup9], (instregex "XOR8rr(_REV)?")>;
 
 def BWWriteResGroup10 : SchedWriteRes<[BWPort4,BWPort237]> {
   let Latency = 1;
@@ -1252,8 +1251,7 @@ def BWWriteResGroup25 : SchedWriteRes<[B
   let NumMicroOps = 3;
   let ResourceCycles = [1,1,1];
 }
-def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)r")>;
-def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)rmr")>;
+def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)r(mr)?")>;
 def: InstRW<[BWWriteResGroup25], (instregex "PUSH64i8")>;
 def: InstRW<[BWWriteResGroup25], (instregex "STOSB")>;
 def: InstRW<[BWWriteResGroup25], (instregex "STOSL")>;
@@ -1299,7 +1297,7 @@ def: InstRW<[BWWriteResGroup27], (instre
 def: InstRW<[BWWriteResGroup27], (instregex "CVTDQ2PSrr")>;
 def: InstRW<[BWWriteResGroup27], (instregex "CVTPS2DQrr")>;
 def: InstRW<[BWWriteResGroup27], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[BWWriteResGroup27], (instregex "IMUL(32|64)rr(i8?)")>;
+def: InstRW<[BWWriteResGroup27], (instregex "IMUL(32|64)rr(i8)?")>;
 def: InstRW<[BWWriteResGroup27], (instregex "IMUL8r")>;
 def: InstRW<[BWWriteResGroup27], (instregex "LZCNT(16|32|64)rr")>;
 def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)PDrr")>;
@@ -1382,7 +1380,7 @@ def BWWriteResGroup27_16 : SchedWriteRes
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[BWWriteResGroup27_16], (instregex "IMUL16rr(i8?)")>;
+def: InstRW<[BWWriteResGroup27_16], (instregex "IMUL16rr(i8)?")>;
 
 def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> {
   let Latency = 3;
@@ -2383,8 +2381,7 @@ def: InstRW<[BWWriteResGroup66], (instre
 def: InstRW<[BWWriteResGroup66], (instregex "CMP8rm")>;
 def: InstRW<[BWWriteResGroup66], (instregex "OR(16|32|64)rm")>;
 def: InstRW<[BWWriteResGroup66], (instregex "OR8rm")>;
-def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)r")>;
-def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)rmr")>;
+def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)r(mr)?")>;
 def: InstRW<[BWWriteResGroup66], (instregex "SUB(16|32|64)rm")>;
 def: InstRW<[BWWriteResGroup66], (instregex "SUB8rm")>;
 def: InstRW<[BWWriteResGroup66], (instregex "TEST(16|32|64)mr")>;
@@ -2779,7 +2776,7 @@ def: InstRW<[BWWriteResGroup91], (instre
 def: InstRW<[BWWriteResGroup91], (instregex "CVTPS2DQrm")>;
 def: InstRW<[BWWriteResGroup91], (instregex "CVTTPS2DQrm")>;
 def: InstRW<[BWWriteResGroup91], (instregex "IMUL64m")>;
-def: InstRW<[BWWriteResGroup91], (instregex "IMUL(32|64)rm(i8?)")>;
+def: InstRW<[BWWriteResGroup91], (instregex "IMUL(32|64)rm(i8)?")>;
 def: InstRW<[BWWriteResGroup91], (instregex "IMUL8m")>;
 def: InstRW<[BWWriteResGroup91], (instregex "LZCNT(16|32|64)rm")>;
 def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)PDrm")>;
@@ -2842,7 +2839,7 @@ def BWWriteResGroup91_16 : SchedWriteRes
   let NumMicroOps = 3;
   let ResourceCycles = [1,1,1]; 
 }
-def: InstRW<[BWWriteResGroup91_16], (instregex "IMUL16rm(i8?)")>;
+def: InstRW<[BWWriteResGroup91_16], (instregex "IMUL16rm(i8)?")>;
 
 def BWWriteResGroup91_16_2 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> {
   let Latency = 8;
@@ -3826,7 +3823,7 @@ def BWWriteResGroup176 : SchedWriteRes<[
   let NumMicroOps = 19;
   let ResourceCycles = [3,1,15];
 }
-def: InstRW<[BWWriteResGroup176], (instregex "XRSTOR(64?)")>;
+def: InstRW<[BWWriteResGroup176], (instregex "XRSTOR(64)?")>;
 
 def BWWriteResGroup177 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
   let Latency = 24;
@@ -3954,7 +3951,7 @@ def BWWriteResGroup186 : SchedWriteRes<[
   let NumMicroOps = 28;
   let ResourceCycles = [1,6,1,1,19];
 }
-def: InstRW<[BWWriteResGroup186], (instregex "XSAVE(OPT?)")>;
+def: InstRW<[BWWriteResGroup186], (instregex "XSAVE(OPT)?")>;
 
 def BWWriteResGroup187 : SchedWriteRes<[BWPort01,BWPort15,BWPort015,BWPort0156]> {
   let Latency = 31;

Modified: llvm/trunk/lib/Target/X86/X86SchedHaswell.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedHaswell.td?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedHaswell.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedHaswell.td Sat Dec  9 17:24:08 2017
@@ -977,12 +977,12 @@ def: InstRW<[HWWriteResGroup4], (instreg
 def: InstRW<[HWWriteResGroup4], (instregex "MOVDI2PDIrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "MOVHLPSrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "MOVLHPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVSDrr(_REV?)")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSDrr(_REV)?")>;
 def: InstRW<[HWWriteResGroup4], (instregex "MOVSHDUPrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "MOVSLDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVSSrr(_REV?)")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVUPDrr(_REV?)")>;
-def: InstRW<[HWWriteResGroup4], (instregex "MOVUPSrr(_REV?)")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVUPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVUPSrr(_REV)?")>;
 def: InstRW<[HWWriteResGroup4], (instregex "ORPDrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "ORPSrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "PACKSSDWrr")>;
@@ -1034,25 +1034,25 @@ def: InstRW<[HWWriteResGroup4], (instreg
 def: InstRW<[HWWriteResGroup4], (instregex "VBROADCASTSSrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VINSERTPSrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VMOV64toPQIrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDYrr(_REV?)")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDrr(_REV?)")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSYrr(_REV?)")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSrr(_REV?)")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSrr(_REV)?")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPYrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VMOVDI2PDIrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VMOVHLPSrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VMOVLHPSrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSDrr(_REV?)")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSDrr(_REV)?")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPYrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPYrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVSSrr(_REV?)")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDYrr(_REV?)")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDrr(_REV?)")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSYrr(_REV?)")>;
-def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSrr(_REV?)")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSrr(_REV)?")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VORPDYrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VORPDrr")>;
 def: InstRW<[HWWriteResGroup4], (instregex "VORPSYrr")>;
@@ -1425,13 +1425,13 @@ def HWWriteResGroup9 : SchedWriteRes<[HW
 def: InstRW<[HWWriteResGroup9], (instregex "BLENDPDrri")>;
 def: InstRW<[HWWriteResGroup9], (instregex "BLENDPSrri")>;
 def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVQ64rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVQ64rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDNirr")>;
 def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDirr")>;
 def: InstRW<[HWWriteResGroup9], (instregex "MMX_PORirr")>;
 def: InstRW<[HWWriteResGroup9], (instregex "MMX_PXORirr")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MOVDQArr(_REV?)")>;
-def: InstRW<[HWWriteResGroup9], (instregex "MOVDQUrr(_REV?)")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
 def: InstRW<[HWWriteResGroup9], (instregex "MOVPQI2QIrr")>;
 def: InstRW<[HWWriteResGroup9], (instregex "PANDNrr")>;
 def: InstRW<[HWWriteResGroup9], (instregex "PANDrr")>;
@@ -1441,10 +1441,10 @@ def: InstRW<[HWWriteResGroup9], (instreg
 def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDrri")>;
 def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSYrri")>;
 def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSrri")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQAYrr(_REV?)")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQArr(_REV?)")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUYrr(_REV?)")>;
-def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUrr(_REV?)")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
 def: InstRW<[HWWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
 def: InstRW<[HWWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
 def: InstRW<[HWWriteResGroup9], (instregex "VPANDNYrr")>;
@@ -1464,32 +1464,32 @@ def HWWriteResGroup10 : SchedWriteRes<[H
   let ResourceCycles = [1];
 }
 def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup10], (instregex "ADD8i8")>;
 def: InstRW<[HWWriteResGroup10], (instregex "ADD8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "ADD8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup10], (instregex "AND8i8")>;
 def: InstRW<[HWWriteResGroup10], (instregex "AND8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "AND8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND8rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup10], (instregex "CBW")>;
 def: InstRW<[HWWriteResGroup10], (instregex "CLC")>;
 def: InstRW<[HWWriteResGroup10], (instregex "CMC")>;
 def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup10], (instregex "CMP8i8")>;
 def: InstRW<[HWWriteResGroup10], (instregex "CMP8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "CMP8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup10], (instregex "CWDE")>;
 def: InstRW<[HWWriteResGroup10], (instregex "DEC(16|32|64)r")>;
 def: InstRW<[HWWriteResGroup10], (instregex "DEC8r")>;
 def: InstRW<[HWWriteResGroup10], (instregex "INC(16|32|64)r")>;
 def: InstRW<[HWWriteResGroup10], (instregex "INC8r")>;
 def: InstRW<[HWWriteResGroup10], (instregex "LAHF")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV?)")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOV8ri(_alt?)")>;
-def: InstRW<[HWWriteResGroup10], (instregex "MOV8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
 def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
 def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
@@ -1501,10 +1501,10 @@ def: InstRW<[HWWriteResGroup10], (instre
 def: InstRW<[HWWriteResGroup10], (instregex "NOT(16|32|64)r")>;
 def: InstRW<[HWWriteResGroup10], (instregex "NOT8r")>;
 def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup10], (instregex "OR8i8")>;
 def: InstRW<[HWWriteResGroup10], (instregex "OR8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "OR8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR8rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup10], (instregex "SAHF")>;
 def: InstRW<[HWWriteResGroup10], (instregex "SGDT64m")>;
 def: InstRW<[HWWriteResGroup10], (instregex "SIDT64m")>;
@@ -1513,10 +1513,10 @@ def: InstRW<[HWWriteResGroup10], (instre
 def: InstRW<[HWWriteResGroup10], (instregex "STC")>;
 def: InstRW<[HWWriteResGroup10], (instregex "STRm")>;
 def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup10], (instregex "SUB8i8")>;
 def: InstRW<[HWWriteResGroup10], (instregex "SUB8ri")>;
-def: InstRW<[HWWriteResGroup10], (instregex "SUB8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup10], (instregex "SYSCALL")>;
 def: InstRW<[HWWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
 def: InstRW<[HWWriteResGroup10], (instregex "TEST8i8")>;
@@ -1594,7 +1594,7 @@ def: InstRW<[HWWriteResGroup12], (instre
 def: InstRW<[HWWriteResGroup12], (instregex "FCOMP32m")>;
 def: InstRW<[HWWriteResGroup12], (instregex "FCOMP64m")>;
 def: InstRW<[HWWriteResGroup12], (instregex "IMUL(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup12], (instregex "IMUL(16|32|64)rm(i8?)")>;
+def: InstRW<[HWWriteResGroup12], (instregex "IMUL(16|32|64)rm(i8)?")>;
 def: InstRW<[HWWriteResGroup12], (instregex "IMUL8m")>;
 def: InstRW<[HWWriteResGroup12], (instregex "LZCNT(16|32|64)rm")>;
 def: InstRW<[HWWriteResGroup12], (instregex "MAX(C?)SDrm")>;
@@ -2084,7 +2084,7 @@ def: InstRW<[HWWriteResGroup18], (instre
 def: InstRW<[HWWriteResGroup18], (instregex "CMP8rm")>;
 def: InstRW<[HWWriteResGroup18], (instregex "OR(16|32|64)rm")>;
 def: InstRW<[HWWriteResGroup18], (instregex "OR8rm")>;
-def: InstRW<[HWWriteResGroup18], (instregex "POP(16|32|64)r(mr?)")>;
+def: InstRW<[HWWriteResGroup18], (instregex "POP(16|32|64)r(mr)?")>;
 def: InstRW<[HWWriteResGroup18], (instregex "SUB(16|32|64)rm")>;
 def: InstRW<[HWWriteResGroup18], (instregex "SUB8rm")>;
 def: InstRW<[HWWriteResGroup18], (instregex "TEST(16|32|64)mr")>;
@@ -2164,7 +2164,7 @@ def HWWriteResGroup24 : SchedWriteRes<[H
   let NumMicroOps = 3;
   let ResourceCycles = [1,1,1];
 }
-def: InstRW<[HWWriteResGroup24], (instregex "PUSH(16|32|64)r(mr?)")>;
+def: InstRW<[HWWriteResGroup24], (instregex "PUSH(16|32|64)r(mr)?")>;
 def: InstRW<[HWWriteResGroup24], (instregex "PUSH64i8")>;
 def: InstRW<[HWWriteResGroup24], (instregex "STOSB")>;
 def: InstRW<[HWWriteResGroup24], (instregex "STOSL")>;
@@ -2355,10 +2355,10 @@ def HWWriteResGroup35 : SchedWriteRes<[H
   let ResourceCycles = [1,1];
 }
 def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup35], (instregex "ADC8i8")>;
 def: InstRW<[HWWriteResGroup35], (instregex "ADC8ri")>;
-def: InstRW<[HWWriteResGroup35], (instregex "ADC8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup35], (instregex "ADC8rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup35], (instregex "CMOVAE(16|32|64)rr")>;
 def: InstRW<[HWWriteResGroup35], (instregex "CMOVB(16|32|64)rr")>;
 def: InstRW<[HWWriteResGroup35], (instregex "CMOVE(16|32|64)rr")>;
@@ -2376,10 +2376,10 @@ def: InstRW<[HWWriteResGroup35], (instre
 def: InstRW<[HWWriteResGroup35], (instregex "CWD")>;
 def: InstRW<[HWWriteResGroup35], (instregex "JRCXZ")>;
 def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)ri8")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup35], (instregex "SBB8i8")>;
 def: InstRW<[HWWriteResGroup35], (instregex "SBB8ri")>;
-def: InstRW<[HWWriteResGroup35], (instregex "SBB8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB8rr(_REV)?")>;
 def: InstRW<[HWWriteResGroup35], (instregex "SETAr")>;
 def: InstRW<[HWWriteResGroup35], (instregex "SETBEr")>;
 
@@ -2597,7 +2597,7 @@ def: InstRW<[HWWriteResGroup50], (instre
 def: InstRW<[HWWriteResGroup50], (instregex "CVTDQ2PSrr")>;
 def: InstRW<[HWWriteResGroup50], (instregex "CVTPS2DQrr")>;
 def: InstRW<[HWWriteResGroup50], (instregex "CVTTPS2DQrr")>;
-def: InstRW<[HWWriteResGroup50], (instregex "IMUL64rr(i8?)")>;
+def: InstRW<[HWWriteResGroup50], (instregex "IMUL64rr(i8)?")>;
 def: InstRW<[HWWriteResGroup50], (instregex "IMUL8r")>;
 def: InstRW<[HWWriteResGroup50], (instregex "LZCNT(16|32|64)rr")>;
 def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)PDrr")>;
@@ -2679,13 +2679,13 @@ def HWWriteResGroup50_16 : SchedWriteRes
   let Latency = 3;
   let NumMicroOps = 4;
 }
-def: InstRW<[HWWriteResGroup50_16], (instregex "IMUL16rr(i8?)")>;
+def: InstRW<[HWWriteResGroup50_16], (instregex "IMUL16rr(i8)?")>;
 
 def HWWriteResGroup50_32 : SchedWriteRes<[HWPort1, HWPort0156]> {
   let Latency = 3;
   let NumMicroOps = 3;
 }
-def: InstRW<[HWWriteResGroup50_32], (instregex "IMUL32rr(i8?)")>;
+def: InstRW<[HWWriteResGroup50_32], (instregex "IMUL32rr(i8)?")>;
 
 def HWWriteResGroup51 : SchedWriteRes<[HWPort5]> {
   let Latency = 3;
@@ -4233,7 +4233,7 @@ def HWWriteResGroup151 : SchedWriteRes<[
   let NumMicroOps = 19;
   let ResourceCycles = [3,1,15];
 }
-def: InstRW<[HWWriteResGroup151], (instregex "XRSTOR(64?)")>;
+def: InstRW<[HWWriteResGroup151], (instregex "XRSTOR(64)?")>;
 
 def HWWriteResGroup152 : SchedWriteRes<[HWPort0,HWPort5,HWPort015,HWPort0156]> {
   let Latency = 19;
@@ -4358,7 +4358,7 @@ def HWWriteResGroup165 : SchedWriteRes<[
   let NumMicroOps = 28;
   let ResourceCycles = [1,6,1,1,19];
 }
-def: InstRW<[HWWriteResGroup165], (instregex "XSAVE(OPT?)")>;
+def: InstRW<[HWWriteResGroup165], (instregex "XSAVE(OPT)?")>;
 
 def HWWriteResGroup166 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
   let Latency = 34;

Modified: llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td Sat Dec  9 17:24:08 2017
@@ -380,11 +380,11 @@ def: InstRW<[SKLWriteResGroup3], (instre
 def: InstRW<[SKLWriteResGroup3], (instregex "MOVDI2PDIrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "MOVHLPSrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "MOVLHPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVSDrr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "MOVSHDUPrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "MOVSLDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPDrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPSrr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "PACKSSDWrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "PACKSSWBrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "PACKUSDWrr")>;
@@ -433,15 +433,15 @@ def: InstRW<[SKLWriteResGroup3], (instre
 def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDI2PDIrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "VMOVHLPSrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "VMOVLHPSrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSDrr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSHDUPrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSLDUPYrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDYrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSYrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSrr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSDWYrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSDWrr")>;
 def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSWBYrr")>;
@@ -676,7 +676,7 @@ def SKLWriteResGroup6 : SchedWriteRes<[S
 }
 def: InstRW<[SKLWriteResGroup6], (instregex "FINCSTP")>;
 def: InstRW<[SKLWriteResGroup6], (instregex "FNOP")>;
-def: InstRW<[SKLWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSBrr64")>;
 def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSDrr64")>;
 def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSWrr64")>;
@@ -702,8 +702,8 @@ def SKLWriteResGroup7 : SchedWriteRes<[S
   let ResourceCycles = [1];
 }
 def: InstRW<[SKLWriteResGroup7], (instregex "ADC(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "ADC8rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "ADC8rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup7], (instregex "ADCX32rr")>;
 def: InstRW<[SKLWriteResGroup7], (instregex "ADCX64rr")>;
 def: InstRW<[SKLWriteResGroup7], (instregex "ADOX32rr")>;
@@ -776,8 +776,8 @@ def: InstRW<[SKLWriteResGroup7], (instre
 def: InstRW<[SKLWriteResGroup7], (instregex "SARX32rr")>;
 def: InstRW<[SKLWriteResGroup7], (instregex "SARX64rr")>;
 def: InstRW<[SKLWriteResGroup7], (instregex "SBB(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup7], (instregex "SBB8rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SBB8rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup7], (instregex "SETAEr")>;
 def: InstRW<[SKLWriteResGroup7], (instregex "SETBr")>;
 def: InstRW<[SKLWriteResGroup7], (instregex "SETEr")>;
@@ -835,12 +835,12 @@ def: InstRW<[SKLWriteResGroup9], (instre
 def: InstRW<[SKLWriteResGroup9], (instregex "BLENDPDrri")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "BLENDPSrri")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPDrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPSrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQArr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQUrr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPDrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPSrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "MOVSSrr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MOVSSrr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "ORPDrr")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "ORPSrr")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "PADDBrr")>;
@@ -867,16 +867,16 @@ def: InstRW<[SKLWriteResGroup9], (instre
 def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPDrri")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPSYrri")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPSrri")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPDYrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPDrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSYrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQAYrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQArr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUYrr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUrr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPDYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPDrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[SKLWriteResGroup9], (instregex "VMOVSSrr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVSSrr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "VORPDYrr")>;
 def: InstRW<[SKLWriteResGroup9], (instregex "VORPDrr")>;
@@ -921,32 +921,32 @@ def SKLWriteResGroup10 : SchedWriteRes<[
   let ResourceCycles = [1];
 }
 def: InstRW<[SKLWriteResGroup10], (instregex "ADD(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "ADD8i8")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "ADD8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "ADD8rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "AND(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "AND8i8")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "AND8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "AND8rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "AND8rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "CBW")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "CLC")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "CMC")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "CMP(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "CMP8i8")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "CMP8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "CMP8rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "CWDE")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "DEC(16|32|64)r")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "DEC8r")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "INC(16|32|64)r")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "INC8r")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "LAHF")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV?)")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOV8ri(_alt?)")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "MOV8rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
@@ -958,10 +958,10 @@ def: InstRW<[SKLWriteResGroup10], (instr
 def: InstRW<[SKLWriteResGroup10], (instregex "NOT(16|32|64)r")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "NOT8r")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "OR(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "OR8i8")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "OR8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "OR8rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "OR8rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "SAHF")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "SGDT64m")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "SIDT64m")>;
@@ -970,10 +970,10 @@ def: InstRW<[SKLWriteResGroup10], (instr
 def: InstRW<[SKLWriteResGroup10], (instregex "STC")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "STRm")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "SUB(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "SUB8i8")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "SUB8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "SUB8rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "SYSCALL")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "TEST8i8")>;
@@ -981,10 +981,10 @@ def: InstRW<[SKLWriteResGroup10], (instr
 def: InstRW<[SKLWriteResGroup10], (instregex "TEST8rr")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "XOR(16|32|64)ri8")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "XOR8i8")>;
 def: InstRW<[SKLWriteResGroup10], (instregex "XOR8ri")>;
-def: InstRW<[SKLWriteResGroup10], (instregex "XOR8rr(_REV?)")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "XOR8rr(_REV)?")>;
 
 def SKLWriteResGroup11 : SchedWriteRes<[SKLPort4,SKLPort237]> {
   let Latency = 1;
@@ -1282,8 +1282,7 @@ def SKLWriteResGroup28 : SchedWriteRes<[
   let NumMicroOps = 3;
   let ResourceCycles = [1,1,1];
 }
-def: InstRW<[SKLWriteResGroup28], (instregex "PUSH(16|32|64)r")>;
-def: InstRW<[SKLWriteResGroup28], (instregex "PUSH(16|32|64)rmr")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "PUSH(16|32|64)r(mr)?")>;
 def: InstRW<[SKLWriteResGroup28], (instregex "PUSH64i8")>;
 def: InstRW<[SKLWriteResGroup28], (instregex "STOSB")>;
 def: InstRW<[SKLWriteResGroup28], (instregex "STOSL")>;
@@ -1297,7 +1296,7 @@ def SKLWriteResGroup29 : SchedWriteRes<[
 }
 def: InstRW<[SKLWriteResGroup29], (instregex "BSF(16|32|64)rr")>;
 def: InstRW<[SKLWriteResGroup29], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[SKLWriteResGroup29], (instregex "IMUL64rr(i8?)")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "IMUL64rr(i8)?")>;
 def: InstRW<[SKLWriteResGroup29], (instregex "IMUL8r")>;
 def: InstRW<[SKLWriteResGroup29], (instregex "LZCNT(16|32|64)rr")>;
 def: InstRW<[SKLWriteResGroup29], (instregex "MUL8r")>;
@@ -1315,13 +1314,13 @@ def SKLWriteResGroup29_16 : SchedWriteRe
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[SKLWriteResGroup29_16], (instregex "IMUL16rr(i8?)")>;
+def: InstRW<[SKLWriteResGroup29_16], (instregex "IMUL16rr(i8)?")>;
 
 def SKLWriteResGroup29_32 : SchedWriteRes<[SKLPort1]> {
   let Latency = 3;
   let NumMicroOps = 1;
 }
-def: InstRW<[SKLWriteResGroup29_32], (instregex "IMUL32rr(i8?)")>;
+def: InstRW<[SKLWriteResGroup29_32], (instregex "IMUL32rr(i8)?")>;
 
 def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> {
   let Latency = 3;
@@ -2230,8 +2229,7 @@ def: InstRW<[SKLWriteResGroup76], (instr
 def: InstRW<[SKLWriteResGroup76], (instregex "CMP8rm")>;
 def: InstRW<[SKLWriteResGroup76], (instregex "OR(16|32|64)rm")>;
 def: InstRW<[SKLWriteResGroup76], (instregex "OR8rm")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "POP(16|32|64)r")>;
-def: InstRW<[SKLWriteResGroup76], (instregex "POP(16|32|64)rmr")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "POP(16|32|64)r(mr)?")>;
 def: InstRW<[SKLWriteResGroup76], (instregex "SUB(16|32|64)rm")>;
 def: InstRW<[SKLWriteResGroup76], (instregex "SUB8rm")>;
 def: InstRW<[SKLWriteResGroup76], (instregex "TEST(16|32|64)mr")>;
@@ -2776,7 +2774,7 @@ def SKLWriteResGroup107 : SchedWriteRes<
 def: InstRW<[SKLWriteResGroup107], (instregex "BSF(16|32|64)rm")>;
 def: InstRW<[SKLWriteResGroup107], (instregex "BSR(16|32|64)rm")>;
 def: InstRW<[SKLWriteResGroup107], (instregex "IMUL64m")>;
-def: InstRW<[SKLWriteResGroup107], (instregex "IMUL(32|64)rm(i8?)")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "IMUL(32|64)rm(i8)?")>;
 def: InstRW<[SKLWriteResGroup107], (instregex "IMUL8m")>;
 def: InstRW<[SKLWriteResGroup107], (instregex "LZCNT(16|32|64)rm")>;
 def: InstRW<[SKLWriteResGroup107], (instregex "MUL(16|32|64)m")>;
@@ -2793,7 +2791,7 @@ def SKLWriteResGroup107_16 : SchedWriteR
   let NumMicroOps = 3;
   let ResourceCycles = [1,1,1]; 
 }
-def: InstRW<[SKLWriteResGroup107_16], (instregex "IMUL16rm(i8?)")>;
+def: InstRW<[SKLWriteResGroup107_16], (instregex "IMUL16rm(i8)?")>;
 
 def SKLWriteResGroup107_16_2 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> {
   let Latency = 3;
@@ -4119,7 +4117,7 @@ def SKLWriteResGroup211 : SchedWriteRes<
   let NumMicroOps = 31;
   let ResourceCycles = [1,8,1,21];
 }
-def: InstRW<[SKLWriteResGroup211], (instregex "XRSTOR(64?)")>;
+def: InstRW<[SKLWriteResGroup211], (instregex "XRSTOR(64)?")>;
 
 def SKLWriteResGroup212 : SchedWriteRes<[SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort15,SKLPort0156]> {
   let Latency = 40;
@@ -4147,7 +4145,7 @@ def SKLWriteResGroup215 : SchedWriteRes<
   let NumMicroOps = 40;
   let ResourceCycles = [1,11,1,1,26];
 }
-def: InstRW<[SKLWriteResGroup215], (instregex "XSAVE")>;
+def: InstRW<[SKLWriteResGroup215], (instregex "^XSAVE$", "XSAVEC", "XSAVES")>;
 
 def SKLWriteResGroup216 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
   let Latency = 46;

Modified: llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td Sat Dec  9 17:24:08 2017
@@ -424,11 +424,11 @@ def: InstRW<[SKXWriteResGroup3], (instre
 def: InstRW<[SKXWriteResGroup3], (instregex "MOVDI2PDIrr")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "MOVHLPSrr")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "MOVLHPSrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVSDrr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "MOVSHDUPrr")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "MOVSLDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPDrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPSrr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "PACKSSDWrr")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "PACKSSWBrr")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "PACKUSDWrr")>;
@@ -487,7 +487,7 @@ def: InstRW<[SKXWriteResGroup3], (instre
 def: InstRW<[SKXWriteResGroup3], (instregex "VMOVLHPSZrr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "VMOVLHPSrr")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSDZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSDrr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZ128rr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZ256rr(b?)(k?)(z?)")>;
@@ -498,11 +498,11 @@ def: InstRW<[SKXWriteResGroup3], (instre
 def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZ256rr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZrr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPrr")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSSZrr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDYrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSYrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSrr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSSZrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWYrr")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZ128rr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZ256rr(b?)(k?)(z?)")>;
@@ -979,7 +979,7 @@ def SKXWriteResGroup6 : SchedWriteRes<[S
 }
 def: InstRW<[SKXWriteResGroup6], (instregex "FINCSTP")>;
 def: InstRW<[SKXWriteResGroup6], (instregex "FNOP")>;
-def: InstRW<[SKXWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSBrr64")>;
 def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSDrr64")>;
 def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSWrr64")>;
@@ -1005,8 +1005,8 @@ def SKXWriteResGroup7 : SchedWriteRes<[S
   let ResourceCycles = [1];
 }
 def: InstRW<[SKXWriteResGroup7], (instregex "ADC(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "ADC8rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "ADC8rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup7], (instregex "ADCX32rr")>;
 def: InstRW<[SKXWriteResGroup7], (instregex "ADCX64rr")>;
 def: InstRW<[SKXWriteResGroup7], (instregex "ADOX32rr")>;
@@ -1079,8 +1079,8 @@ def: InstRW<[SKXWriteResGroup7], (instre
 def: InstRW<[SKXWriteResGroup7], (instregex "SARX32rr")>;
 def: InstRW<[SKXWriteResGroup7], (instregex "SARX64rr")>;
 def: InstRW<[SKXWriteResGroup7], (instregex "SBB(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup7], (instregex "SBB8rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SBB8rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup7], (instregex "SETAEr")>;
 def: InstRW<[SKXWriteResGroup7], (instregex "SETBr")>;
 def: InstRW<[SKXWriteResGroup7], (instregex "SETEr")>;
@@ -1138,12 +1138,12 @@ def: InstRW<[SKXWriteResGroup9], (instre
 def: InstRW<[SKXWriteResGroup9], (instregex "BLENDPDrri")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "BLENDPSrri")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPDrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPSrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQArr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQUrr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPDrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPSrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "MOVPQI2QIrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "MOVSSrr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MOVSSrr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "ORPDrr")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "ORPSrr")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "PADDBrr")>;
@@ -1188,47 +1188,47 @@ def: InstRW<[SKXWriteResGroup9], (instre
 def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPDrri")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPSYrri")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPSrri")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDYrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ128rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ256rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZrr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSYrr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDYrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSYrr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZ128rr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZ256rr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZrr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z128rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z256rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Zrr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Z128rr(b?)(k?)(z?)(_REV?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Z128rr(b?)(k?)(z?)(_REV)?")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Z256rr(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Zrr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQAYrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQArr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z128rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z256rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Zrr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z128rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z256rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Zrr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z128rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z256rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Zrr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z128rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z256rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Zrr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUYrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUrr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "VMOVPQI(2Q|Lo2PQ)IZrr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVSSrr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ128rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ256rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZrr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ128rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ256rr(b?)(k?)(z?)(_REV?)")>;
-def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZrr(b?)(k?)(z?)(_REV?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVSSrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZrr(b?)(k?)(z?)(_REV)?")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "VORPDYrr")>;
 def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZ128rr(b?)(k?)(z?)")>;
@@ -1350,33 +1350,32 @@ def SKXWriteResGroup10 : SchedWriteRes<[
   let ResourceCycles = [1];
 }
 def: InstRW<[SKXWriteResGroup10], (instregex "ADD(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "ADD8i8")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "ADD8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "ADD8rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "AND(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "AND8i8")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "AND8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "AND8rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "AND8rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "CBW")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "CLC")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "CMC")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "CMP(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "CMP8i8")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "CMP8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "CMP8rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "CWDE")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "DEC(16|32|64)r")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "DEC8r")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "INC(16|32|64)r")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "INC8r")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "LAHF")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV?)")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOV8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOV8ri_alt")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "MOV8rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
@@ -1388,10 +1387,10 @@ def: InstRW<[SKXWriteResGroup10], (instr
 def: InstRW<[SKXWriteResGroup10], (instregex "NOT(16|32|64)r")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "NOT8r")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "OR(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "OR8i8")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "OR8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "OR8rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "OR8rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "SAHF")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "SGDT64m")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "SIDT64m")>;
@@ -1400,10 +1399,10 @@ def: InstRW<[SKXWriteResGroup10], (instr
 def: InstRW<[SKXWriteResGroup10], (instregex "STC")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "STRm")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "SUB(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "SUB8i8")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "SUB8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "SUB8rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "SYSCALL")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "TEST8i8")>;
@@ -1411,10 +1410,10 @@ def: InstRW<[SKXWriteResGroup10], (instr
 def: InstRW<[SKXWriteResGroup10], (instregex "TEST8rr")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "XOR(16|32|64)ri8")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "XOR8i8")>;
 def: InstRW<[SKXWriteResGroup10], (instregex "XOR8ri")>;
-def: InstRW<[SKXWriteResGroup10], (instregex "XOR8rr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "XOR8rr(_REV)?")>;
 
 def SKXWriteResGroup11 : SchedWriteRes<[SKXPort4,SKXPort237]> {
   let Latency = 1;
@@ -1797,8 +1796,7 @@ def SKXWriteResGroup28 : SchedWriteRes<[
   let NumMicroOps = 3;
   let ResourceCycles = [1,1,1];
 }
-def: InstRW<[SKXWriteResGroup28], (instregex "PUSH(16|32|64)r")>;
-def: InstRW<[SKXWriteResGroup28], (instregex "PUSH(16|32|64)rmr")>;
+def: InstRW<[SKXWriteResGroup28], (instregex "PUSH(16|32|64)r(mr)?")>;
 def: InstRW<[SKXWriteResGroup28], (instregex "PUSH64i8")>;
 def: InstRW<[SKXWriteResGroup28], (instregex "STOSB")>;
 def: InstRW<[SKXWriteResGroup28], (instregex "STOSL")>;
@@ -1841,7 +1839,7 @@ def SKXWriteResGroup31 : SchedWriteRes<[
 }
 def: InstRW<[SKXWriteResGroup31], (instregex "BSF(16|32|64)rr")>;
 def: InstRW<[SKXWriteResGroup31], (instregex "BSR(16|32|64)rr")>;
-def: InstRW<[SKXWriteResGroup31], (instregex "IMUL64rr(i8?)")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "IMUL64rr(i8)?")>;
 def: InstRW<[SKXWriteResGroup31], (instregex "IMUL8r")>;
 def: InstRW<[SKXWriteResGroup31], (instregex "LZCNT(16|32|64)rr")>;
 def: InstRW<[SKXWriteResGroup31], (instregex "MUL8r")>;
@@ -1859,13 +1857,13 @@ def SKXWriteResGroup31_16 : SchedWriteRe
   let NumMicroOps = 2;
   let ResourceCycles = [1,1];
 }
-def: InstRW<[SKXWriteResGroup31_16], (instregex "IMUL16rr(i8?)")>;
+def: InstRW<[SKXWriteResGroup31_16], (instregex "IMUL16rr(i8)?")>;
 
 def SKXWriteResGroup31_32 : SchedWriteRes<[SKXPort1]> {
   let Latency = 3;
   let NumMicroOps = 1;
 }
-def: InstRW<[SKXWriteResGroup31_32], (instregex "IMUL32rr(i8?)")>;
+def: InstRW<[SKXWriteResGroup31_32], (instregex "IMUL32rr(i8)?")>;
 
 def SKXWriteResGroup32 : SchedWriteRes<[SKXPort5]> {
   let Latency = 3;
@@ -1918,8 +1916,8 @@ def: InstRW<[SKXWriteResGroup32], (instr
 def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZ128rri(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZ256rri(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZrri(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSDZrr(_Int?)(b?)(k?)(z?)")>;
-def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSSZrr(_Int?)(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSDZrr(_Int)?(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSSZrr(_Int)?(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZ128rri(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZ256rri(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZrri(b?)(k?)(z?)")>;
@@ -2196,7 +2194,7 @@ def: InstRW<[SKXWriteResGroup33], (instr
 def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRDrr")>;
 def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRQZrr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRQrr")>;
-def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWZrr(_REV?)")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWZrr(_REV)?")>;
 def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWri")>;
 def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWrr_REV")>;
 def: InstRW<[SKXWriteResGroup33], (instregex "VPTESTYrr")>;
@@ -2446,9 +2444,9 @@ def: InstRW<[SKXWriteResGroup50], (instr
 def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZ256rr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZrr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSDZrr(_Int?)(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDSDZrr(_Int)?(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VADDSDrr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VADDSSZrr(_Int?)(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDSSZrr(_Int)?(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VADDSSrr")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPDYrr")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPDrr")>;
@@ -2520,9 +2518,9 @@ def: InstRW<[SKXWriteResGroup50], (instr
 def: InstRW<[SKXWriteResGroup50], (instregex "VFMADD132PSZ256r(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VFMADD132PSZr(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VFMADD132PSr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFMADD132SDZr(_Int?)(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VFMADD132SDZr(_Int)?(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VFMADD132SDr")>;
-def: InstRW<[SKXWriteResGroup50], (instregex "VFMADD132SSZr(_Int?)(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VFMADD132SSZr(_Int)?(b?)(k?)(z?)")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VFMADD132SSr")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VFMADD213PDYr")>;
 def: InstRW<[SKXWriteResGroup50], (instregex "VFMADD213PDZ128r(b?)(k?)(z?)")>;
@@ -3546,8 +3544,7 @@ def: InstRW<[SKXWriteResGroup81], (instr
 def: InstRW<[SKXWriteResGroup81], (instregex "CMP8rm")>;
 def: InstRW<[SKXWriteResGroup81], (instregex "OR(16|32|64)rm")>;
 def: InstRW<[SKXWriteResGroup81], (instregex "OR8rm")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "POP(16|32|64)r")>;
-def: InstRW<[SKXWriteResGroup81], (instregex "POP(16|32|64)rmr")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "POP(16|32|64)r(mr)?")>;
 def: InstRW<[SKXWriteResGroup81], (instregex "SUB(16|32|64)rm")>;
 def: InstRW<[SKXWriteResGroup81], (instregex "SUB8rm")>;
 def: InstRW<[SKXWriteResGroup81], (instregex "TEST(16|32|64)mr")>;
@@ -4362,7 +4359,7 @@ def SKXWriteResGroup118 : SchedWriteRes<
 def: InstRW<[SKXWriteResGroup118], (instregex "BSF(16|32|64)rm")>;
 def: InstRW<[SKXWriteResGroup118], (instregex "BSR(16|32|64)rm")>;
 def: InstRW<[SKXWriteResGroup118], (instregex "IMUL64m")>;
-def: InstRW<[SKXWriteResGroup118], (instregex "IMUL(32|64)rm(i8?)")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "IMUL(32|64)rm(i8)?")>;
 def: InstRW<[SKXWriteResGroup118], (instregex "IMUL8m")>;
 def: InstRW<[SKXWriteResGroup118], (instregex "LZCNT(16|32|64)rm")>;
 def: InstRW<[SKXWriteResGroup118], (instregex "MUL(16|32|64)m")>;
@@ -4379,7 +4376,7 @@ def SKXWriteResGroup118_16_1 : SchedWrit
   let NumMicroOps = 3;
   let ResourceCycles = [1,1,1]; 
 }
-def: InstRW<[SKXWriteResGroup118_16_1], (instregex "IMUL16rm(i8?)")>;
+def: InstRW<[SKXWriteResGroup118_16_1], (instregex "IMUL16rm(i8)?")>;
 
 def SKXWriteResGroup118_16_2 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
   let Latency = 8;
@@ -6830,7 +6827,7 @@ def SKXWriteResGroup250 : SchedWriteRes<
   let NumMicroOps = 31;
   let ResourceCycles = [1,8,1,21];
 }
-def: InstRW<[SKXWriteResGroup250], (instregex "XRSTOR(64?)")>;
+def: InstRW<[SKXWriteResGroup250], (instregex "XRSTOR(64)?")>;
 
 def SKXWriteResGroup251 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
   let Latency = 38;

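A quick way to sanity-check what one of the updated instregex patterns accepts is to try it against the instruction-name variants it is meant to cover. The snippet below uses Python's re module purely as an illustrative stand-in (TableGen's instregex matching is its own implementation, so the full-match anchoring here is an assumption), with candidate names taken from the hunks above:

    import re

    # Old and new spellings of one of the patterns changed above.
    old_pat = re.compile(r"MOVAPDrr(_REV?)")
    new_pat = re.compile(r"MOVAPDrr(_REV)?")

    # Try both spellings against the plain rr move name and its _REV form.
    for name in ["MOVAPDrr", "MOVAPDrr_REV"]:
        print(name,
              "old:", bool(old_pat.fullmatch(name)),
              "new:", bool(new_pat.fullmatch(name)))

    # Expected output:
    #   MOVAPDrr old: False new: True
    #   MOVAPDrr_REV old: True new: True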
Modified: llvm/trunk/test/CodeGen/X86/avx-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-schedule.ll?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-schedule.ll Sat Dec  9 17:24:08 2017
@@ -2077,14 +2077,14 @@ define <2 x double> @test_maskmovpd(i8*
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
 ; SKYLAKE-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
-; SKYLAKE-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_maskmovpd:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
 ; SKX-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
-; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovpd:
@@ -2140,14 +2140,14 @@ define <4 x double> @test_maskmovpd_ymm(
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
 ; SKYLAKE-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
-; SKYLAKE-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKYLAKE-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_maskmovpd_ymm:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
 ; SKX-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovpd_ymm:
@@ -2203,14 +2203,14 @@ define <4 x float> @test_maskmovps(i8* %
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
 ; SKYLAKE-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
-; SKYLAKE-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_maskmovps:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
 ; SKX-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovps:
@@ -2266,14 +2266,14 @@ define <8 x float> @test_maskmovps_ymm(i
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
 ; SKYLAKE-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
-; SKYLAKE-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKYLAKE-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_maskmovps_ymm:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
 ; SKX-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovps_ymm:

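For the FileCheck updates in these scheduling tests, the bracketed pair in the "# sched: [1:0.33]" comments is conventionally read as latency followed by reciprocal throughput; treating it that way (an assumption for this aside, not something the patch itself spells out), 0.33 is roughly one third of a cycle per instruction. A minimal sketch of that arithmetic, assuming a single-uop instruction that can issue on any one of N equivalent ports:

    # Hypothetical illustration: reciprocal throughput of a single-uop
    # instruction that can issue on any of N equivalent ports is 1/N cycles.
    for ports in (1, 3, 4):
        print(ports, "ports ->", round(1.0 / ports, 2))

    # Expected output:
    #   1 ports -> 1.0
    #   3 ports -> 0.33
    #   4 ports -> 0.25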
Modified: llvm/trunk/test/CodeGen/X86/avx2-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-schedule.ll?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-schedule.ll Sat Dec  9 17:24:08 2017
@@ -3391,28 +3391,28 @@ define <4 x i32> @test_pmaskmovd(i8* %a0
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
 ; HASWELL-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; HASWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_pmaskmovd:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:2.00]
 ; BROADWELL-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
-; BROADWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; BROADWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_pmaskmovd:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
 ; SKYLAKE-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
-; SKYLAKE-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; SKYLAKE-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_pmaskmovd:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
 ; SKX-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
-; SKX-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_pmaskmovd:
@@ -3440,28 +3440,28 @@ define <8 x i32> @test_pmaskmovd_ymm(i8*
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [9:2.00]
 ; HASWELL-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; HASWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_pmaskmovd_ymm:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:2.00]
 ; BROADWELL-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
-; BROADWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; BROADWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_pmaskmovd_ymm:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
 ; SKYLAKE-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
-; SKYLAKE-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKYLAKE-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_pmaskmovd_ymm:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
 ; SKX-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_pmaskmovd_ymm:
@@ -3489,28 +3489,28 @@ define <2 x i64> @test_pmaskmovq(i8* %a0
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
 ; HASWELL-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; HASWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_pmaskmovq:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:2.00]
 ; BROADWELL-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
-; BROADWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; BROADWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_pmaskmovq:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
 ; SKYLAKE-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
-; SKYLAKE-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; SKYLAKE-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_pmaskmovq:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
 ; SKX-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
-; SKX-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_pmaskmovq:
@@ -3538,28 +3538,28 @@ define <4 x i64> @test_pmaskmovq_ymm(i8*
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [9:2.00]
 ; HASWELL-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; HASWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_pmaskmovq_ymm:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:2.00]
 ; BROADWELL-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
-; BROADWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; BROADWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_pmaskmovq_ymm:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
 ; SKYLAKE-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
-; SKYLAKE-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKYLAKE-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_pmaskmovq_ymm:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
 ; SKX-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_pmaskmovq_ymm:

Modified: llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-schedule.ll?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-schedule.ll Sat Dec  9 17:24:08 2017
@@ -994,7 +994,7 @@ define <8 x double> @test_mask_broadcast
 ; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [11:0.50]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
   %tmp = load double, double* %j
@@ -4558,9 +4558,9 @@ define <64 x i16> @test21(<64 x i16> %x
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpsllw $7, %zmm2, %zmm2 # sched: [1:0.50]
 ; SKX-NEXT:    vpmovb2m %zmm2, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25]
+; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; SKX-NEXT:    kshiftrq $32, %k1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.25]
+; SKX-NEXT:    vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer
   ret <64 x i16> %ret
@@ -7447,7 +7447,7 @@ define <32 x i16> @vmov_test21(<32 x i16
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50]
 ; SKX-NEXT:    vpmovb2m %ymm1, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25]
+; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
   ret <32 x i16> %ret
@@ -7680,7 +7680,7 @@ define <32 x i16> @test_build_vec_v32i1(
 ; SKX-NEXT:    movl $1497715861, %eax # imm = 0x59455495
 ; SKX-NEXT:    # sched: [1:0.25]
 ; SKX-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25]
+; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
   ret <32 x i16> %ret
@@ -8454,7 +8454,7 @@ define   <8 x double> @_sd8xdouble_mask(
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
   %b = insertelement <8 x double> undef, double %a, i32 0
@@ -8588,7 +8588,7 @@ define <16 x i32> @test_vbroadcast() {
 ; SKX-NEXT:    vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00]
 ; SKX-NEXT:    vpmovm2d %k0, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    knotw %k0, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 entry:
   %0 = sext <16 x i1> zeroinitializer to <16 x i32>

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll Sat Dec  9 17:24:08 2017
@@ -35,7 +35,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -80,7 +80,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -125,7 +125,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -185,7 +185,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -461,7 +461,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -506,7 +506,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -551,7 +551,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -611,7 +611,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -887,7 +887,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -932,7 +932,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -977,7 +977,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1037,7 +1037,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1313,7 +1313,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1358,7 +1358,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1403,7 +1403,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1463,7 +1463,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1735,7 +1735,7 @@ define <4 x i64> @test_masked_4xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1776,7 +1776,7 @@ define <4 x i64> @test_masked_4xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1817,7 +1817,7 @@ define <4 x i64> @test_masked_4xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1871,7 +1871,7 @@ define <4 x i64> @test_masked_4xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2125,7 +2125,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2168,7 +2168,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2211,7 +2211,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2267,7 +2267,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2310,7 +2310,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2353,7 +2353,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2411,7 +2411,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2454,7 +2454,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2910,7 +2910,7 @@ define <8 x float> @test_masked_8xfloat_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -2955,7 +2955,7 @@ define <8 x float> @test_masked_8xfloat_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -3000,7 +3000,7 @@ define <8 x float> @test_masked_8xfloat_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -3060,7 +3060,7 @@ define <8 x float> @test_masked_8xfloat_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -3758,7 +3758,7 @@ define <4 x double> @test_masked_4xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -3799,7 +3799,7 @@ define <4 x double> @test_masked_4xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -3840,7 +3840,7 @@ define <4 x double> @test_masked_4xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -3894,7 +3894,7 @@ define <4 x double> @test_masked_4xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -4148,7 +4148,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4191,7 +4191,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4234,7 +4234,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4290,7 +4290,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4333,7 +4333,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4376,7 +4376,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4434,7 +4434,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4477,7 +4477,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -4929,7 +4929,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -4970,7 +4970,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -5011,7 +5011,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -5065,7 +5065,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -5335,7 +5335,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -5376,7 +5376,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -5417,7 +5417,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -5471,7 +5471,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -5741,7 +5741,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -5782,7 +5782,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -5823,7 +5823,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -5877,7 +5877,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -6147,7 +6147,7 @@ define <8 x i16> @test_masked_8xi16_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -6188,7 +6188,7 @@ define <8 x i16> @test_masked_8xi16_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -6229,7 +6229,7 @@ define <8 x i16> @test_masked_8xi16_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -6283,7 +6283,7 @@ define <8 x i16> @test_masked_8xi16_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -6324,7 +6324,7 @@ define <8 x i16> @test_masked_8xi16_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -6365,7 +6365,7 @@ define <8 x i16> @test_masked_8xi16_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -6419,7 +6419,7 @@ define <8 x i16> @test_masked_8xi16_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -6460,7 +6460,7 @@ define <8 x i16> @test_masked_8xi16_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -6892,7 +6892,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -6933,7 +6933,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -6974,7 +6974,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -7028,7 +7028,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -7069,7 +7069,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -7110,7 +7110,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -7164,7 +7164,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -7205,7 +7205,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -7637,7 +7637,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7678,7 +7678,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7719,7 +7719,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7773,7 +7773,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7814,7 +7814,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7855,7 +7855,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7909,7 +7909,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -7950,7 +7950,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -8229,7 +8229,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.25]
+; SKX-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
@@ -8252,7 +8252,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.25]
+; SKX-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
@@ -8386,7 +8386,7 @@ define <4 x i32> @test_masked_4xi32_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8427,7 +8427,7 @@ define <4 x i32> @test_masked_4xi32_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8468,7 +8468,7 @@ define <4 x i32> @test_masked_4xi32_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8522,7 +8522,7 @@ define <4 x i32> @test_masked_4xi32_perm
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8772,7 +8772,7 @@ define <8 x i32> @test2_masked_8xi32_per
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -8813,7 +8813,7 @@ define <8 x i32> @test2_masked_8xi32_per
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -8854,7 +8854,7 @@ define <8 x i32> @test2_masked_8xi32_per
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -8908,7 +8908,7 @@ define <8 x i32> @test2_masked_8xi32_per
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9158,7 +9158,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -9199,7 +9199,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -9240,7 +9240,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -9294,7 +9294,7 @@ define <16 x i32> @test2_masked_16xi32_p
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -9544,7 +9544,7 @@ define <8 x float> @test2_8xfloat_masked
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9585,7 +9585,7 @@ define <8 x float> @test2_8xfloat_masked
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9626,7 +9626,7 @@ define <8 x float> @test2_8xfloat_masked
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9680,7 +9680,7 @@ define <8 x float> @test2_8xfloat_masked
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9735,7 +9735,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9779,7 +9779,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9823,7 +9823,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9881,7 +9881,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -10332,7 +10332,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10373,7 +10373,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10414,7 +10414,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10468,7 +10468,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10523,7 +10523,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10567,7 +10567,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10611,7 +10611,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10669,7 +10669,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10726,7 +10726,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -10767,7 +10767,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -10808,7 +10808,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -10862,7 +10862,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -10917,7 +10917,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
@@ -10961,7 +10961,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11005,7 +11005,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
@@ -11063,7 +11063,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
@@ -11120,7 +11120,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11161,7 +11161,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11202,7 +11202,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11256,7 +11256,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11311,7 +11311,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11355,7 +11355,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11399,7 +11399,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11457,7 +11457,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11514,7 +11514,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -11555,7 +11555,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -11596,7 +11596,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -11650,7 +11650,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -11705,7 +11705,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
@@ -11749,7 +11749,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
@@ -11793,7 +11793,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
@@ -11851,7 +11851,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -11908,7 +11908,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11949,7 +11949,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11990,7 +11990,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12044,7 +12044,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12099,7 +12099,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12143,7 +12143,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12187,7 +12187,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12245,7 +12245,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12302,7 +12302,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -12343,7 +12343,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -12384,7 +12384,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -12438,7 +12438,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -12493,7 +12493,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
@@ -12537,7 +12537,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
@@ -12581,7 +12581,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
@@ -12639,7 +12639,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
@@ -12696,7 +12696,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -12737,7 +12737,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -12778,7 +12778,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -12832,7 +12832,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -12887,7 +12887,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12931,7 +12931,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12975,7 +12975,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -13033,7 +13033,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -13090,7 +13090,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -13131,7 +13131,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -13172,7 +13172,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -13226,7 +13226,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -13281,7 +13281,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13325,7 +13325,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13369,7 +13369,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13427,7 +13427,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13878,7 +13878,7 @@ define <2 x double> @test_2xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
@@ -13919,7 +13919,7 @@ define <2 x double> @test_2xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
@@ -13974,7 +13974,7 @@ define <2 x double> @test_2xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
-; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -14018,7 +14018,7 @@ define <2 x double> @test_2xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
-; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -14075,7 +14075,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -14116,7 +14116,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -14157,7 +14157,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -14211,7 +14211,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -14266,7 +14266,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14310,7 +14310,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14354,7 +14354,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14412,7 +14412,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14469,7 +14469,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -14510,7 +14510,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -14551,7 +14551,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -14605,7 +14605,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -14660,7 +14660,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14704,7 +14704,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14748,7 +14748,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14806,7 +14806,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14863,7 +14863,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -14904,7 +14904,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -14945,7 +14945,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -14999,7 +14999,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -15054,7 +15054,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -15098,7 +15098,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -15142,7 +15142,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -15200,7 +15200,7 @@ define <4 x float> @test_4xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -15257,7 +15257,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -15298,7 +15298,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -15339,7 +15339,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -15393,7 +15393,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -15448,7 +15448,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15492,7 +15492,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15536,7 +15536,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15594,7 +15594,7 @@ define <8 x float> @test_8xfloat_masked_
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -16045,7 +16045,7 @@ define <2 x double> @test_2xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
@@ -16086,7 +16086,7 @@ define <2 x double> @test_2xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
@@ -16141,7 +16141,7 @@ define <2 x double> @test_2xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -16185,7 +16185,7 @@ define <2 x double> @test_2xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -16242,7 +16242,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -16283,7 +16283,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -16324,7 +16324,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -16378,7 +16378,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -16433,7 +16433,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16477,7 +16477,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16521,7 +16521,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16579,7 +16579,7 @@ define <4 x double> @test_4xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16636,7 +16636,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16677,7 +16677,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16718,7 +16718,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16772,7 +16772,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16827,7 +16827,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16871,7 +16871,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16915,7 +16915,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16973,7 +16973,7 @@ define <8 x double> @test_8xdouble_maske
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>

Modified: llvm/trunk/test/CodeGen/X86/mul-constant-i64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mul-constant-i64.ll?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mul-constant-i64.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mul-constant-i64.ll Sat Dec  9 17:24:08 2017
@@ -1792,7 +1792,7 @@ define i64 @test_mul_spec(i64 %x) nounwi
 ; X64-HSW-NEXT:    addq $42, %rcx # sched: [1:0.25]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addq $2, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    imulq %rcx, %rax # sched: [4:1.00]
+; X64-HSW-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
 ; X64-HSW-NEXT:    retq # sched: [7:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_spec:
@@ -1840,7 +1840,7 @@ define i64 @test_mul_spec(i64 %x) nounwi
 ; HSW-NOOPT-NEXT:    addq $42, %rcx # sched: [1:0.25]
 ; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    addq $2, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    imulq %rcx, %rax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
 ; HSW-NOOPT-NEXT:    retq # sched: [7:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_spec:

Modified: llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/recip-fastmath.ll?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/recip-fastmath.ll (original)
+++ llvm/trunk/test/CodeGen/X86/recip-fastmath.ll Sat Dec  9 17:24:08 2017
@@ -270,7 +270,7 @@ define float @f32_two_step(float %x) #2
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
 ; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
-; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
@@ -535,7 +535,7 @@ define <4 x float> @v4f32_two_step(<4 x
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
 ; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
-; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
@@ -823,7 +823,7 @@ define <8 x float> @v8f32_two_step(<8 x
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
 ; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
-; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.33]

Modified: llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll Sat Dec  9 17:24:08 2017
@@ -382,7 +382,7 @@ define float @f32_two_step_2(float %x) #
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
 ; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm2 # sched: [4:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm3 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ss %xmm1, %xmm0, %xmm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd132ss %xmm2, %xmm2, %xmm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213ss %xmm1, %xmm3, %xmm0 # sched: [4:0.33]
@@ -710,7 +710,7 @@ define <4 x float> @v4f32_two_step2(<4 x
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
 ; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
-; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
@@ -1069,7 +1069,7 @@ define <8 x float> @v8f32_two_step2(<8 x
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
 ; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
-; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.33]

Modified: llvm/trunk/test/CodeGen/X86/sha-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sha-schedule.ll?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sha-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sha-schedule.ll Sat Dec  9 17:24:08 2017
@@ -219,11 +219,11 @@ define <4 x i32> @test_sha256rnds2(<4 x
 ;
 ; CANNONLAKE-LABEL: test_sha256rnds2:
 ; CANNONLAKE:       # %bb.0:
-; CANNONLAKE-NEXT:    vmovaps %xmm0, %xmm3 # sched: [1:1.00]
-; CANNONLAKE-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; CANNONLAKE-NEXT:    vmovaps %xmm0, %xmm3 # sched: [1:0.33]
+; CANNONLAKE-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; CANNONLAKE-NEXT:    sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [5:1.00]
 ; CANNONLAKE-NEXT:    sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [10:1.00]
-; CANNONLAKE-NEXT:    vmovaps %xmm3, %xmm0 # sched: [1:1.00]
+; CANNONLAKE-NEXT:    vmovaps %xmm3, %xmm0 # sched: [1:0.33]
 ; CANNONLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_sha256rnds2:

Modified: llvm/trunk/test/CodeGen/X86/sse-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-schedule.ll?rev=320279&r1=320278&r2=320279&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse-schedule.ll Sat Dec  9 17:24:08 2017
@@ -2134,7 +2134,7 @@ define <4 x float> @test_movss_reg(<4 x
 ;
 ; SKX-LABEL: test_movss_reg:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00]
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movss_reg:
