[llvm] r306414 - Updated and extended the information about each instruction in HSW and SNB to include the following data:

Gadi Haber via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 27 08:05:13 PDT 2017


Author: gadi.haber
Date: Tue Jun 27 08:05:13 2017
New Revision: 306414

URL: http://llvm.org/viewvc/llvm-project?rev=306414&view=rev
Log:
Updated and extended the information about each instruction in HSW (Haswell) and SNB (Sandy Bridge) to include the following data (an annotated example follows the list):
• static latency
• number of uOps of which the instruction consists
• all ports used by the instruction
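
For reference, this is how those three pieces of data appear in the TableGen scheduling definitions. The entry below is the horizontal add/sub write from the Haswell changes in this patch, repeated here with explanatory comments added for illustration only:

  // HADD/HSUB PS/PD on Haswell: latency, uOp count and port usage.
  def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> {
    let Latency = 5;              // static latency in cycles
    let NumMicroOps = 3;          // number of uOps the instruction decodes into
    let ResourceCycles = [1, 2];  // cycles consumed on HWPort1 and HWPort5, respectively
  }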

Reviewers: RKSimon, zvi, aymanmus, m_zuckerman
Differential Revision: https://reviews.llvm.org/D33897
 

Modified:
    llvm/trunk/lib/Target/X86/X86SchedHaswell.td
    llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td
    llvm/trunk/test/CodeGen/X86/avx-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll
    llvm/trunk/test/CodeGen/X86/avx2-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll
    llvm/trunk/test/CodeGen/X86/avx512-cmp.ll
    llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
    llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
    llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-256.ll
    llvm/trunk/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
    llvm/trunk/test/CodeGen/X86/fp128-i128.ll
    llvm/trunk/test/CodeGen/X86/gather-addresses.ll
    llvm/trunk/test/CodeGen/X86/half.ll
    llvm/trunk/test/CodeGen/X86/illegal-bitfield-loadstore.ll
    llvm/trunk/test/CodeGen/X86/mul-constant-i32.ll
    llvm/trunk/test/CodeGen/X86/mul-constant-i64.ll
    llvm/trunk/test/CodeGen/X86/pr32329.ll
    llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
    llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll
    llvm/trunk/test/CodeGen/X86/sse-schedule.ll
    llvm/trunk/test/CodeGen/X86/sse2-schedule.ll
    llvm/trunk/test/CodeGen/X86/sse3-schedule.ll
    llvm/trunk/test/CodeGen/X86/sse41-schedule.ll
    llvm/trunk/test/CodeGen/X86/sse42-schedule.ll
    llvm/trunk/test/CodeGen/X86/ssse3-schedule.ll
    llvm/trunk/test/CodeGen/X86/vector-shift-ashr-512.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v32.ll
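
The bulk of the Haswell diff below follows a single pattern: a SchedWriteRes group fixes the latency, uOp count and port usage, and InstRW statements attach instructions to that group by opcode regex. A representative excerpt, copied from the patch with one comment added for illustration:

  def HWWriteResGroup2 : SchedWriteRes<[HWPort0]> {
    let Latency = 1;
    let NumMicroOps = 1;
    let ResourceCycles = [1];
  }
  // Each InstRW line maps the opcodes matching its regex onto the group above.
  def: InstRW<[HWWriteResGroup2], (instregex "MOVPDI2DIrr")>;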

Modified: llvm/trunk/lib/Target/X86/X86SchedHaswell.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedHaswell.td?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedHaswell.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedHaswell.td Tue Jun 27 08:05:13 2017
@@ -23,8 +23,8 @@ def HaswellModel : SchedMachineModel {
   // Based on the LSD (loop-stream detector) queue size and benchmarking data.
   let LoopMicroOpBufferSize = 50;
 
-  // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
-  // the scheduler to assign a default model to unrecognized opcodes.
+  // This flag is set to allow the scheduler to assign a default model to 
+  // unrecognized opcodes.
   let CompleteModel = 0;
 }
 
@@ -267,1914 +267,3251 @@ def : WriteRes<WriteMicrocoded, [HWPort0
 def : WriteRes<WriteFence,  [HWPort23, HWPort4]>;
 def : WriteRes<WriteNop, []>;
 
-//================ Exceptions ================//
-
-//-- Specific Scheduling Models --//
-
-// Starting with P0.
-def WriteP0 : SchedWriteRes<[HWPort0]>;
-
-def WriteP0_P1_Lat4 : SchedWriteRes<[HWPort0, HWPort1]> {
-  let Latency = 4;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
-}
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub  instructions.
+////////////////////////////////////////////////////////////////////////////////
 
-def WriteP0_P1_Lat4Ld : SchedWriteRes<[HWPort0, HWPort1, HWPort23]> {
-  let Latency = 8;
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> {
+  let Latency = 5;
   let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
+  let ResourceCycles = [1, 2];
 }
 
-def WriteP01 : SchedWriteRes<[HWPort01]>;
+// x,m / v,v,m.
+def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
 
-def Write2P01 : SchedWriteRes<[HWPort01]> {
-  let NumMicroOps = 2;
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
 }
-def Write3P01 : SchedWriteRes<[HWPort01]> {
+// v <- v,m.
+def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> {
+  let Latency = 6;
   let NumMicroOps = 3;
+  let ResourceCycles = [1, 2, 1];
 }
 
-def WriteP015 : SchedWriteRes<[HWPort015]>;
+// Remaining instrs.
 
-def WriteP01_P5 : SchedWriteRes<[HWPort01, HWPort5]> {
-  let NumMicroOps = 2;
+def HWWriteResGroup0 : SchedWriteRes<[HWPort23]> {
+  let Latency = 0;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def WriteP06 : SchedWriteRes<[HWPort06]>;
+def: InstRW<[HWWriteResGroup0], (instregex "LDDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVD64rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVD64to64rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVQ64rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOV32rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOV64toPQIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOV8rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVAPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVAPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDI2PDIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVNTDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSHDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSLDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSX32rm16")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSX32rm8")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVUPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVUPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVZX32rm16")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVZX32rm8")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHNTA")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHT0")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHT1")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHT2")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTF128")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTI128")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VLDDQUYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VLDDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOV64toPQIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPSYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDDUPYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDI2PDIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQAYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQUYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVNTDQAYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVNTDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVQI2PQIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSHDUPYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSHDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSLDUPYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSLDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPSYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTQYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTQrm")>;
+
+def HWWriteResGroup1 : SchedWriteRes<[HWPort4,HWPort237]> {
+  let Latency = 0;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVNTQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVQ64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV8mi")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV8mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVAPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVAPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVDQAmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVDQUmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVHPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVHPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVLPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVLPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTDQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTI_64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPDI2DImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPQI2QImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPQIto64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVSSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVUPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVUPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTF128mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTI128mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPDI2DImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQI2QImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQIto64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVSDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVSSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMPTRSTm")>;
 
-def Write2P06 : SchedWriteRes<[HWPort06]> {
+def HWWriteResGroup2 : SchedWriteRes<[HWPort0]> {
   let Latency = 1;
-  let NumMicroOps = 2;
-  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64grr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PMOVMSKBrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MOVPDI2DIrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MOVPQIto64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VMOVPDI2DIrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VMOVPQIto64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDrm")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQrm")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWrm")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRADYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSrr")>;
 
-def Write3P06_Lat2 : SchedWriteRes<[HWPort06]> {
-  let Latency = 2;
-  let NumMicroOps = 3;
-  let ResourceCycles = [3];
+def HWWriteResGroup3 : SchedWriteRes<[HWPort1]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
+def: InstRW<[HWWriteResGroup3], (instregex "MASKMOVDQU64")>;
+def: InstRW<[HWWriteResGroup3], (instregex "MMX_MASKMOVQ64")>;
 
-def WriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
-  let NumMicroOps = 2;
+def HWWriteResGroup4 : SchedWriteRes<[HWPort5]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
+def: InstRW<[HWWriteResGroup4], (instregex "ANDNPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDNPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "INSERTPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "KORTESTBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64rr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64to64rr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVQ2DQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PALIGNR64irr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFBrr64")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHBWirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHDQirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHWDirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLBWirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLDQirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLWDirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOV64toPQIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVAPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVAPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVDDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVDI2PDIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVHLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVLHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSHDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSLDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVUPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVUPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKSSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKSSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKUSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKUSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PALIGNRrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PBLENDWrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFHWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFLWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSLLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSRLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "SHUFPDrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "SHUFPSrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VBROADCASTSSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VGATHERQPSZrm")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VINSERTPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVHLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVLHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDrm")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSrm")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "XORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "XORPSrr")>;
 
-def Write2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
-  let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
+def HWWriteResGroup5 : SchedWriteRes<[HWPort6]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
+def: InstRW<[HWWriteResGroup5], (instregex "JMP64r")>;
 
-def Write2P0156_Lat2 : SchedWriteRes<[HWPort0156]> {
-  let Latency = 2;
-  let ResourceCycles = [2];
-}
-def Write2P0156_Lat2Ld : SchedWriteRes<[HWPort0156, HWPort23]> {
-  let Latency = 6;
-  let ResourceCycles = [2, 1];
+def HWWriteResGroup6 : SchedWriteRes<[HWPort01]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
+def: InstRW<[HWWriteResGroup6], (instregex "FINCSTP")>;
+def: InstRW<[HWWriteResGroup6], (instregex "FNOP")>;
 
-def Write5P0156 : SchedWriteRes<[HWPort0156]> {
-  let NumMicroOps = 5;
-  let ResourceCycles = [5];
+def HWWriteResGroup7 : SchedWriteRes<[HWPort0]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
+def: InstRW<[HWWriteResGroup7], (instregex "BT32ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BT32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTC32ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTC32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTR32ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTR32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTS32ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTS32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "CDQ")>;
+def: InstRW<[HWWriteResGroup7], (instregex "CQO")>;
+def: InstRW<[HWWriteResGroup7], (instregex "RORX32ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "RORX64ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR32ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR64r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SARX32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SARX64rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETAEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETBr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETGEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETGr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETLEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETLr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNOr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNPr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNSr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETOr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETPr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETSr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL32ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL64r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHLX32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHLX64rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR32ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR64r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHRX32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHRX64rr")>;
 
-def WriteP0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> {
   let Latency = 1;
-  let ResourceCycles = [1, 2, 1];
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
+def: InstRW<[HWWriteResGroup8], (instregex "ANDN32rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "ANDN64rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSI32rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSI64rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSMSK32rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSMSK64rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSR32rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSR64rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BZHI32rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BZHI64rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "LEA64_32r")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSBrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSDrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSWrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDQirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXUBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINUBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNBrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNDrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNWrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBQirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PABSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PABSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PABSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PAVGBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PAVGWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSIGNBrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSIGNDrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSIGNWrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VMASKMOVPSYrm")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDQYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBYrr256")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDYrr256")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWYrr256")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWrr")>;
 
-def Write2P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+def HWWriteResGroup9 : SchedWriteRes<[HWPort015]> {
   let Latency = 1;
-  let ResourceCycles = [2, 2, 1];
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
+def: InstRW<[HWWriteResGroup9], (instregex "BLENDPDrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "BLENDPSrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVQ64rr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDNirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PORirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PXORirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MOVDQArr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MOVDQUrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MOVPQI2QIrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PANDNrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PANDrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PORrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PXORrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDYrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSYrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQAYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQArr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDNYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDNrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDYrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPORYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPORrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPXORYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPXORrr")>;
 
-def Write3P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+def HWWriteResGroup10 : SchedWriteRes<[HWPort0156]> {
   let Latency = 1;
-  let ResourceCycles = [3, 2, 1];
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
+def: InstRW<[HWWriteResGroup10], (instregex "ADD32ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD32rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD8rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND32ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND64ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND64rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND8rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CBW")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CLC")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMC")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP16ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP32i32")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP64rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP8rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CWDE")>;
+def: InstRW<[HWWriteResGroup10], (instregex "DEC64r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "DEC8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "INC64r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "INC8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "LAHF")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV32rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV8rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVSX32rr16")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVSX32rr8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVZX32rr16")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVZX32rr8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NEG64r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NEG8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NOOP")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NOT64r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NOT8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR64ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR64rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR8rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SAHF")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SGDT64m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SIDT64m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SLDT16m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SMSW16m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "STC")>;
+def: InstRW<[HWWriteResGroup10], (instregex "STRm")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB64ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB64rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB8rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SYSCALL")>;
+def: InstRW<[HWWriteResGroup10], (instregex "TEST64rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "TEST8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "TEST8rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XCHG64rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR32rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR64ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR8rr")>;
 
-// Starting with P1.
-def WriteP1 : SchedWriteRes<[HWPort1]>;
-
-def WriteP1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
+def HWWriteResGroup11 : SchedWriteRes<[HWPort0,HWPort23]> {
+  let Latency = 1;
   let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def WriteP1_Lat3 : SchedWriteRes<[HWPort1]> {
-  let Latency = 3;
-}
-def WriteP1_Lat3Ld : SchedWriteRes<[HWPort1, HWPort23]> {
-  let Latency = 7;
-}
+def: InstRW<[HWWriteResGroup11], (instregex "CVTPS2PDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "CVTSS2SDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLQrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLWrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRADrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRAWrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLQrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLWrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VCVTPH2PSYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VCVTPH2PSrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VCVTPS2PDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VCVTSS2SDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSLLDYri")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSLLQYri")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSLLVQYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSLLVQrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSLLWYri")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRADYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRAWYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRLDYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRLQYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRLVQYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRLVQrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRLWYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VTESTPDYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VTESTPDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VTESTPSYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VTESTPSrm")>;
 
-def Write2P1 : SchedWriteRes<[HWPort1]> {
+def HWWriteResGroup12 : SchedWriteRes<[HWPort5,HWPort23]> {
+  let Latency = 1;
   let NumMicroOps = 2;
-  let ResourceCycles = [2];
-}
-def Write2P1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
-  let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
-}
-def WriteP15 : SchedWriteRes<[HWPort15]>;
-def WriteP15Ld : SchedWriteRes<[HWPort15, HWPort23]> {
-  let Latency = 4;
+  let ResourceCycles = [1,1];
 }
+def: InstRW<[HWWriteResGroup12], (instregex "ANDNPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "ANDNPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "ANDPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "ANDPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "INSERTPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_PALIGNR64irm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_PINSRWirmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_PSHUFBrm64")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_PSHUFWmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_PUNPCKHBWirm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_PUNPCKHDQirm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_PUNPCKHWDirm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_PUNPCKLBWirm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_PUNPCKLDQirm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_PUNPCKLWDirm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MOVHPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MOVHPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MOVLPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MOVLPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "ORPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "ORPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PACKSSDWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PACKSSWBrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PACKUSDWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PACKUSWBrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PALIGNRrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PBLENDWrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PINSRBrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PINSRDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PINSRQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PINSRWrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVSXBDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVSXBQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVSXBWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVSXDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVSXWDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVSXWQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVZXBDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVZXBQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVZXBWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVZXDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVZXWDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PMOVZXWQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PSHUFBrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PSHUFDmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PSHUFHWmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PSHUFLWmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PUNPCKHBWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PUNPCKHDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PUNPCKHQDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PUNPCKHWDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PUNPCKLBWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PUNPCKLDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PUNPCKLQDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PUNPCKLWDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "SHUFPDrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "SHUFPSrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "UNPCKHPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "UNPCKHPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "UNPCKLPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "UNPCKLPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VANDNPDYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VANDNPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VANDNPSYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VANDNPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VANDPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VANDPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VANDPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VANDPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VINSERTPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VMOVHPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VMOVHPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VMOVLPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VMOVLPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VORPDYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VORPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VORPSYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VORPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPACKSSDWYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPACKSSDWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPACKSSWBYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPACKSSWBrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPACKUSDWYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPACKUSDWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPACKUSWBYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPACKUSWBrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPALIGNRYrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPALIGNRrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPBLENDWYrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPBLENDWrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPERMILPDYri")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPERMILPDmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPERMILPDmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPERMILPDri")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPERMILPSYri")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPERMILPSmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPERMILPSmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPERMILPSri")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPINSRBrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPINSRDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPINSRQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPINSRWrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVSXBDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVSXBQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVSXBWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVSXDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVSXWDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVSXWQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVZXBDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVZXBQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVZXBWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVZXDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVZXWDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPMOVZXWQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPSHUFBrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPSHUFDYmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPSHUFDmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPSHUFHWmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPSHUFLWYmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPSHUFLWmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKHBWYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKHBWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKHDQYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKHDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKHQDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKHWDYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKHWDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKLBWrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKLDQYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKLDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKLQDQYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKLQDQrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKLWDYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VPUNPCKLWDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VSHUFPDYrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VSHUFPDrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VSHUFPSYrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VSHUFPSrmi")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VUNPCKHPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VUNPCKHPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VUNPCKLPDYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VUNPCKLPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VUNPCKLPSYrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VUNPCKLPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VXORPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VXORPSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "XORPDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "XORPSrm")>;
 
-def WriteP1_P5_Lat4 : SchedWriteRes<[HWPort1, HWPort5]> {
-  let Latency = 4;
+def HWWriteResGroup13 : SchedWriteRes<[HWPort6,HWPort23]> {
+  let Latency = 1;
   let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+  let ResourceCycles = [1,1];
 }
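+// Memory-indirect and far jumps: a branch uop on port 6 plus a load uop on port 23.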
+def: InstRW<[HWWriteResGroup13], (instregex "FARJMP64")>;
+def: InstRW<[HWWriteResGroup13], (instregex "JMP64m")>;
 
-def WriteP1_P5_Lat4Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
-  let Latency = 8;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup14 : SchedWriteRes<[HWPort23,HWPort0]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
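+// Memory-operand BT and BMI2 shift/rotate forms: a load uop (port 23) feeding a single port-0 uop.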
+def: InstRW<[HWWriteResGroup14], (instregex "BT64mi8")>;
+def: InstRW<[HWWriteResGroup14], (instregex "RORX32mi")>;
+def: InstRW<[HWWriteResGroup14], (instregex "RORX64mi")>;
+def: InstRW<[HWWriteResGroup14], (instregex "SARX32rm")>;
+def: InstRW<[HWWriteResGroup14], (instregex "SARX64rm")>;
+def: InstRW<[HWWriteResGroup14], (instregex "SHLX32rm")>;
+def: InstRW<[HWWriteResGroup14], (instregex "SHLX64rm")>;
+def: InstRW<[HWWriteResGroup14], (instregex "SHRX32rm")>;
+def: InstRW<[HWWriteResGroup14], (instregex "SHRX64rm")>;
 
-def WriteP1_P5_Lat6 : SchedWriteRes<[HWPort1, HWPort5]> {
-  let Latency = 6;
+def HWWriteResGroup15 : SchedWriteRes<[HWPort23,HWPort15]> {
+  let Latency = 1;
   let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+  let ResourceCycles = [1,1];
 }
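+// Load-op forms of BMI instructions, MOVBE64rm and simple integer SIMD arithmetic: a load uop plus one uop on port 1 or 5.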
+def: InstRW<[HWWriteResGroup15], (instregex "ANDN32rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "ANDN64rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "BLSI32rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "BLSI64rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "BLSMSK32rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "BLSMSK64rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "BLSR32rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "BLSR64rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "BZHI32rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "BZHI64rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PABSBrm64")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PABSDrm64")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PABSWrm64")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PADDBirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PADDDirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PADDQirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PADDSBirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PADDSWirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PADDUSBirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PADDUSWirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PADDWirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PAVGBirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PAVGWirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PCMPEQBirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PCMPEQDirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PCMPEQWirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PCMPGTBirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PCMPGTDirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PCMPGTWirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PMAXSWirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PMAXUBirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PMINSWirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PMINUBirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PSIGNBrm64")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PSIGNDrm64")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PSIGNWrm64")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PSUBBirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PSUBDirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PSUBQirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PSUBSBirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PSUBSWirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PSUBUSBirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PSUBUSWirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MMX_PSUBWirm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "MOVBE64rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PABSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PABSDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PABSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PADDBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PADDDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PADDQrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PADDSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PADDSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PADDUSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PADDUSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PADDWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PAVGBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PAVGWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PCMPEQBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PCMPEQDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PCMPEQQrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PCMPEQWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PCMPGTBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PCMPGTDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PCMPGTWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMAXSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMAXSDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMAXSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMAXUBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMAXUDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMAXUWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMINSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMINSDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMINSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMINUBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMINUDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PMINUWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PSIGNBrm128")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PSIGNDrm128")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PSIGNWrm128")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PSUBBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PSUBDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PSUBQrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PSUBSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PSUBSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PSUBUSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PSUBUSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "PSUBWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPABSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPABSDYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPABSDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPABSWYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPABSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPADDBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPADDDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPADDQYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPADDQrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPADDSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPADDSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPADDUSBYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPADDUSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPADDUSWYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPADDUSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPADDWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPAVGBYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPAVGBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPAVGWYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPAVGWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPCMPEQBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPCMPEQDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPCMPEQQrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPCMPEQWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPCMPGTBYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPCMPGTBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPCMPGTDYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPCMPGTDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPCMPGTWYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPCMPGTWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXSBYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXSDYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXSDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXSWYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXUBYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXUBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXUDYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXUDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXUWYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMAXUWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINSBYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINSDYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINSDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINSWYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINUBYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINUBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINUDYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINUDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINUWYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPMINUWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSIGNBYrm256")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSIGNBrm128")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSIGNDYrm256")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSIGNDrm128")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSIGNWYrm256")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSIGNWrm128")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBBYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBDYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBDrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBQYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBQrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBSBYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBSWYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBUSBYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBUSBrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBUSWYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBUSWrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBWYrm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "VPSUBWrm")>;
 
-def WriteP1_P5_Lat6Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
-  let Latency = 10;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort015]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
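+// Memory forms of vector bitwise logic, immediate blends and 128-bit inserts: a load uop plus one uop on port 0, 1 or 5.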
+def: InstRW<[HWWriteResGroup16], (instregex "BLENDPDrmi")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BLENDPSrmi")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PANDNirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PANDirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PORirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PXORirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PANDNrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PANDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PORrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PXORrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VBLENDPDYrmi")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VBLENDPDrmi")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VBLENDPSYrmi")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VBLENDPSrmi")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VINSERTF128rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VINSERTI128rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPANDNYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPANDNrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPANDYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPANDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPBLENDDYrmi")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPBLENDDrmi")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPORYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPORrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPXORYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPXORrm")>;
 
-// Starting with P2.
-def Write2P237_P4 : SchedWriteRes<[HWPort237, HWPort4]> {
+def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort0156]> {
   let Latency = 1;
-  let ResourceCycles = [2, 1];
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
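+// Load-op forms of the basic GPR ALU instructions (plus POP64r): a load uop plus one uop on any integer ALU port.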
+def: InstRW<[HWWriteResGroup17], (instregex "ADD64rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "ADD8rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "AND64rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "AND8rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "CMP64mi8")>;
+def: InstRW<[HWWriteResGroup17], (instregex "CMP64mr")>;
+def: InstRW<[HWWriteResGroup17], (instregex "CMP64rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "CMP8mi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "CMP8mr")>;
+def: InstRW<[HWWriteResGroup17], (instregex "CMP8rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "OR64rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "OR8rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "POP64r")>;
+def: InstRW<[HWWriteResGroup17], (instregex "SUB64rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "SUB8rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "TEST64rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "TEST8mi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "TEST8rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "XOR64rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "XOR8rm")>;
 
-// Starting with P5.
-def WriteP5 : SchedWriteRes<[HWPort5]>;
-def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> {
-  let Latency = 5;
+def HWWriteResGroup18 : SchedWriteRes<[HWPort237,HWPort0156]> {
+  let Latency = 1;
   let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+  let ResourceCycles = [1,1];
 }
+def: InstRW<[HWWriteResGroup18], (instregex "SFENCE")>;
 
-// Notation:
-// - r: register.
-// - mm: 64 bit mmx register.
-// - x = 128 bit xmm register.
-// - (x)mm = mmx or xmm register.
-// - y = 256 bit ymm register.
-// - v = any vector register.
-// - m = memory.
-
-//=== Integer Instructions ===//
-//-- Move instructions --//
-
-// MOV.
-// r16,m.
-def : InstRW<[WriteALULd], (instregex "MOV16rm")>;
-
-// MOVSX, MOVZX.
-// r,m.
-def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
-
-// CMOVcc.
-// r,r.
-def : InstRW<[Write2P0156_Lat2],
-      (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd],
-      (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>;
-
-// XCHG.
-// r,r.
-def WriteXCHG : SchedWriteRes<[HWPort0156]> {
-  let Latency = 2;
-  let ResourceCycles = [3];
+def HWWriteResGroup19 : SchedWriteRes<[HWPort4,HWPort5,HWPort237]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
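+// Extract-to-memory instructions and STMXCSR: a port-5 extract uop plus store-address (port 2/3/7) and store-data (port 4) uops.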
+def: InstRW<[HWWriteResGroup19], (instregex "EXTRACTPSmr")>;
+def: InstRW<[HWWriteResGroup19], (instregex "PEXTRBmr")>;
+def: InstRW<[HWWriteResGroup19], (instregex "PEXTRDmr")>;
+def: InstRW<[HWWriteResGroup19], (instregex "PEXTRQmr")>;
+def: InstRW<[HWWriteResGroup19], (instregex "PEXTRWmr")>;
+def: InstRW<[HWWriteResGroup19], (instregex "STMXCSR")>;
+def: InstRW<[HWWriteResGroup19], (instregex "VEXTRACTPSmr")>;
+def: InstRW<[HWWriteResGroup19], (instregex "VPEXTRBmr")>;
+def: InstRW<[HWWriteResGroup19], (instregex "VPEXTRDmr")>;
+def: InstRW<[HWWriteResGroup19], (instregex "VPEXTRQmr")>;
+def: InstRW<[HWWriteResGroup19], (instregex "VPEXTRWmr")>;
+def: InstRW<[HWWriteResGroup19], (instregex "VSTMXCSR")>;
 
-def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
+def HWWriteResGroup20 : SchedWriteRes<[HWPort4,HWPort6,HWPort237]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup20], (instregex "FNSTCW16m")>;
 
-// r,m.
-def WriteXCHGrm : SchedWriteRes<[]> {
-  let Latency = 21;
-  let NumMicroOps = 8;
+def HWWriteResGroup21 : SchedWriteRes<[HWPort4,HWPort237,HWPort0]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETAEm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETBm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETEm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETGEm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETGm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETLEm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETLm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETNEm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETNOm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETNPm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETNSm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETOm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETPm")>;
+def: InstRW<[HWWriteResGroup21], (instregex "SETSm")>;
 
-// XLAT.
-def WriteXLAT : SchedWriteRes<[]> {
-  let Latency = 7;
+def HWWriteResGroup22 : SchedWriteRes<[HWPort4,HWPort237,HWPort15]> {
+  let Latency = 1;
   let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteXLAT], (instregex "XLAT")>;
+def: InstRW<[HWWriteResGroup22], (instregex "MOVBE64mr")>;
 
-// PUSH.
-// m.
-def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>;
+def HWWriteResGroup23 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
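+// PUSH of a register/immediate and STOS: store-address and store-data uops plus one integer ALU uop.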
+def: InstRW<[HWWriteResGroup23], (instregex "PUSH64i8")>;
+def: InstRW<[HWWriteResGroup23], (instregex "PUSH64r")>;
+def: InstRW<[HWWriteResGroup23], (instregex "STOSB")>;
+def: InstRW<[HWWriteResGroup23], (instregex "STOSL")>;
+def: InstRW<[HWWriteResGroup23], (instregex "STOSQ")>;
+def: InstRW<[HWWriteResGroup23], (instregex "STOSW")>;
 
-// PUSHF.
-def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> {
+def HWWriteResGroup24 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0]> {
+  let Latency = 1;
   let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
 }
-def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>;
+def: InstRW<[HWWriteResGroup24], (instregex "BTC64mi8")>;
+def: InstRW<[HWWriteResGroup24], (instregex "BTR64mi8")>;
+def: InstRW<[HWWriteResGroup24], (instregex "BTS64mi8")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SAR64m1")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SAR64mi")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SAR8m1")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SAR8mi")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SHL64m1")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SHL64mi")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SHL8m1")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SHL8mi")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SHR64m1")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SHR64mi")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SHR8m1")>;
+def: InstRW<[HWWriteResGroup24], (instregex "SHR8mi")>;
 
-// PUSHA.
-def WritePushA : SchedWriteRes<[]> {
-  let NumMicroOps = 19;
+def HWWriteResGroup25 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+  let Latency = 1;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
 }
-def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>;
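+// Read-modify-write GPR ALU forms (and PUSH/POP with a memory operand): load, ALU, store-address and store-data uops.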
+def: InstRW<[HWWriteResGroup25], (instregex "ADD64mi8")>;
+def: InstRW<[HWWriteResGroup25], (instregex "ADD64mr")>;
+def: InstRW<[HWWriteResGroup25], (instregex "ADD8mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "ADD8mr")>;
+def: InstRW<[HWWriteResGroup25], (instregex "AND64mi8")>;
+def: InstRW<[HWWriteResGroup25], (instregex "AND64mr")>;
+def: InstRW<[HWWriteResGroup25], (instregex "AND8mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "AND8mr")>;
+def: InstRW<[HWWriteResGroup25], (instregex "DEC64m")>;
+def: InstRW<[HWWriteResGroup25], (instregex "DEC8m")>;
+def: InstRW<[HWWriteResGroup25], (instregex "INC64m")>;
+def: InstRW<[HWWriteResGroup25], (instregex "INC8m")>;
+def: InstRW<[HWWriteResGroup25], (instregex "NEG64m")>;
+def: InstRW<[HWWriteResGroup25], (instregex "NEG8m")>;
+def: InstRW<[HWWriteResGroup25], (instregex "NOT64m")>;
+def: InstRW<[HWWriteResGroup25], (instregex "NOT8m")>;
+def: InstRW<[HWWriteResGroup25], (instregex "OR64mi8")>;
+def: InstRW<[HWWriteResGroup25], (instregex "OR64mr")>;
+def: InstRW<[HWWriteResGroup25], (instregex "OR8mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "OR8mr")>;
+def: InstRW<[HWWriteResGroup25], (instregex "POP64rmm")>;
+def: InstRW<[HWWriteResGroup25], (instregex "PUSH64rmm")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SUB64mi8")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SUB64mr")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SUB8mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SUB8mr")>;
+def: InstRW<[HWWriteResGroup25], (instregex "XOR64mi8")>;
+def: InstRW<[HWWriteResGroup25], (instregex "XOR64mr")>;
+def: InstRW<[HWWriteResGroup25], (instregex "XOR8mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "XOR8mr")>;
 
-// POP.
-// m.
-def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>;
-
-// POPF.
-def WritePopF : SchedWriteRes<[]> {
-  let NumMicroOps = 9;
+def HWWriteResGroup26 : SchedWriteRes<[HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
 }
-def : InstRW<[WritePopF], (instregex "POPF(16|32)")>;
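+// Variable blends and GPR-to-vector inserts: two uops on port 5, 2-cycle latency.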
+def: InstRW<[HWWriteResGroup26], (instregex "BLENDVPDrr0")>;
+def: InstRW<[HWWriteResGroup26], (instregex "BLENDVPSrr0")>;
+def: InstRW<[HWWriteResGroup26], (instregex "MMX_PINSRWirri")>;
+def: InstRW<[HWWriteResGroup26], (instregex "PBLENDVBrr0")>;
+def: InstRW<[HWWriteResGroup26], (instregex "PINSRBrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "PINSRDrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "PINSRQrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "PINSRWrri")>;
+def: InstRW<[HWWriteResGroup26], (instregex "VBLENDVPDYrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "VBLENDVPDrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "VBLENDVPSYrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "VBLENDVPSrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "VPBLENDVBYrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "VPBLENDVBrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "VPINSRBrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "VPINSRDrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "VPINSRQrr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "VPINSRWrri")>;
 
-// POPA.
-def WritePopA : SchedWriteRes<[]> {
-  let NumMicroOps = 18;
+def HWWriteResGroup27 : SchedWriteRes<[HWPort01]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
 }
-def : InstRW<[WritePopA], (instregex "POPA(16|32)")>;
-
-// LAHF SAHF.
-def : InstRW<[WriteP06], (instregex "(S|L)AHF")>;
+def: InstRW<[HWWriteResGroup27], (instregex "FDECSTP")>;
 
-// BSWAP.
-// r32.
-def WriteBSwap32 : SchedWriteRes<[HWPort15]>;
-def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>;
-
-// r64.
-def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> {
+def HWWriteResGroup28 : SchedWriteRes<[HWPort0]> {
+  let Latency = 2;
   let NumMicroOps = 2;
+  let ResourceCycles = [2];
 }
-def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>;
-
-// MOVBE.
-// r16,m16 / r64,m64.
-def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>;
+def: InstRW<[HWWriteResGroup28], (instregex "ROL32ri")>;
+def: InstRW<[HWWriteResGroup28], (instregex "ROL64r1")>;
+def: InstRW<[HWWriteResGroup28], (instregex "ROL8r1")>;
+def: InstRW<[HWWriteResGroup28], (instregex "ROL8ri")>;
+def: InstRW<[HWWriteResGroup28], (instregex "ROR32ri")>;
+def: InstRW<[HWWriteResGroup28], (instregex "ROR64r1")>;
+def: InstRW<[HWWriteResGroup28], (instregex "ROR8r1")>;
+def: InstRW<[HWWriteResGroup28], (instregex "ROR8ri")>;
 
-// r32, m32.
-def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> {
+def HWWriteResGroup29 : SchedWriteRes<[HWPort0156]> {
+  let Latency = 2;
   let NumMicroOps = 2;
+  let ResourceCycles = [2];
 }
-def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>;
+def: InstRW<[HWWriteResGroup29], (instregex "LFENCE")>;
+def: InstRW<[HWWriteResGroup29], (instregex "MFENCE")>;
+def: InstRW<[HWWriteResGroup29], (instregex "WAIT")>;
+def: InstRW<[HWWriteResGroup29], (instregex "XGETBV")>;
 
-// m16,r16.
-def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
-  let NumMicroOps = 3;
+def HWWriteResGroup30 : SchedWriteRes<[HWPort0,HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>;
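+// Two-uop port 0 + port 5 operations: register extracts, PS<->PD/PH conversions, xmm-count shifts and PTEST.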
+def: InstRW<[HWWriteResGroup30], (instregex "CVTPS2PDrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "CVTSS2SDrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "EXTRACTPSrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "MMX_PEXTRWirri")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PEXTRBrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PEXTRDrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PEXTRQrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PEXTRWri")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PSLLDrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PSLLQrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PSLLWrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PSRADrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PSRAWrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PSRLDrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PSRLQrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PSRLWrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "PTESTrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VCVTPH2PSYrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VCVTPH2PSrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VCVTPS2PDrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VCVTSS2SDrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VEXTRACTPSrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VPEXTRBrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VPEXTRDrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VPEXTRQrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VPEXTRWri")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VPSRADrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VPSRAWrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VPSRLDrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VPSRLQrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VPSRLWrr")>;
+def: InstRW<[HWWriteResGroup30], (instregex "VPTESTrr")>;
 
-// m32,r32.
-def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> {
-  let NumMicroOps = 3;
+def HWWriteResGroup31 : SchedWriteRes<[HWPort6,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "CLFLUSH")>;
 
-// m64,r64.
-def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> {
-  let NumMicroOps = 4;
+def HWWriteResGroup32 : SchedWriteRes<[HWPort01,HWPort015]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>;
-
-//-- Arithmetic instructions --//
-
-// ADD SUB.
-// m,r/i.
-def : InstRW<[Write2P0156_2P237_P4],
-              (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
-              "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>;
-
-// ADC SBB.
-// r,r/i.
-def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)",
-                           "(ADC|SBB)(16|32|64)ri8",
-                           "(ADC|SBB)64ri32",
-                           "(ADC|SBB)(8|16|32|64)rr_REV")>;
+def: InstRW<[HWWriteResGroup32], (instregex "MMX_MOVDQ2Qrr")>;
 
-// r,m.
-def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>;
-
-// m,r/i.
-def : InstRW<[Write3P0156_2P237_P4],
-             (instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
-              "(ADC|SBB)(16|32|64)mi8",
-              "(ADC|SBB)64mi32")>;
-
-// INC DEC NOT NEG.
-// m.
-def : InstRW<[WriteP0156_2P237_P4],
-             (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m",
-              "(INC|DEC)64(16|32)m")>;
+def HWWriteResGroup33 : SchedWriteRes<[HWPort0,HWPort15]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup33], (instregex "BEXTR32rr")>;
+def: InstRW<[HWWriteResGroup33], (instregex "BEXTR64rr")>;
+def: InstRW<[HWWriteResGroup33], (instregex "BSWAP32r")>;
 
-// MUL IMUL.
-// r16.
-def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> {
-  let Latency = 4;
-  let NumMicroOps = 4;
+def HWWriteResGroup34 : SchedWriteRes<[HWPort0,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>;
+def: InstRW<[HWWriteResGroup34], (instregex "ADC64ri8")>;
+def: InstRW<[HWWriteResGroup34], (instregex "ADC64rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "ADC8ri")>;
+def: InstRW<[HWWriteResGroup34], (instregex "ADC8rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVAE32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVB32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVE32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVG32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVGE32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVL32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVLE32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVNE32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVNO32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVNP32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVNS32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVO32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVP32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CMOVS32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "CWD")>;
+def: InstRW<[HWWriteResGroup34], (instregex "SBB32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "SBB64ri8")>;
+def: InstRW<[HWWriteResGroup34], (instregex "SBB8ri")>;
+def: InstRW<[HWWriteResGroup34], (instregex "SBB8rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "SETAr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "SETBEr")>;
 
-// m16.
-def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
-  let Latency = 8;
-  let NumMicroOps = 5;
+def HWWriteResGroup35 : SchedWriteRes<[HWPort5,HWPort23]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
 }
-def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>;
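+// Memory forms of variable blends, MMX packs and masked vector loads: two port-5 uops plus a load uop.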
+def: InstRW<[HWWriteResGroup35], (instregex "BLENDVPDrm0")>;
+def: InstRW<[HWWriteResGroup35], (instregex "BLENDVPSrm0")>;
+def: InstRW<[HWWriteResGroup35], (instregex "MMX_PACKSSDWirm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "MMX_PACKSSWBirm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "MMX_PACKUSWBirm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "PBLENDVBrm0")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VBLENDVPDYrm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VBLENDVPDrm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VBLENDVPSYrm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VBLENDVPSrm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VMASKMOVPDrm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VMASKMOVPSrm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VPBLENDVBYrm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VPBLENDVBrm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VPMASKMOVDYrm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VPMASKMOVDrm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VPMASKMOVQYrm")>;
+def: InstRW<[HWWriteResGroup35], (instregex "VPMASKMOVQrm")>;
 
-// r32.
-def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> {
-  let Latency = 4;
+def HWWriteResGroup36 : SchedWriteRes<[HWPort23,HWPort0156]> {
+  let Latency = 2;
   let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
 }
-def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>;
+def: InstRW<[HWWriteResGroup36], (instregex "LEAVE64")>;
+def: InstRW<[HWWriteResGroup36], (instregex "SCASB")>;
+def: InstRW<[HWWriteResGroup36], (instregex "SCASL")>;
+def: InstRW<[HWWriteResGroup36], (instregex "SCASQ")>;
+def: InstRW<[HWWriteResGroup36], (instregex "SCASW")>;
 
-// m32.
-def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
-  let Latency = 8;
-  let NumMicroOps = 4;
+def HWWriteResGroup37 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>;
+def: InstRW<[HWWriteResGroup37], (instregex "PSLLDrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "PSLLQrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "PSLLWrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "PSRADrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "PSRAWrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "PSRLDrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "PSRLQrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "PSRLWrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "PTESTrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "VPSLLDri")>;
+def: InstRW<[HWWriteResGroup37], (instregex "VPSLLQri")>;
+def: InstRW<[HWWriteResGroup37], (instregex "VPSLLWri")>;
+def: InstRW<[HWWriteResGroup37], (instregex "VPSRADrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "VPSRAWrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "VPSRLDrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "VPSRLQrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "VPSRLWrm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "VPTESTrm")>;
 
-// r64.
-def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> {
-  let Latency = 3;
-  let NumMicroOps = 2;
+def HWWriteResGroup38 : SchedWriteRes<[HWPort0,HWPort01,HWPort23]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>;
+def: InstRW<[HWWriteResGroup38], (instregex "FLDCW16m")>;
 
-// m64.
-def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
-  let Latency = 7;
+def HWWriteResGroup39 : SchedWriteRes<[HWPort0,HWPort23,HWPort0156]> {
+  let Latency = 2;
   let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>;
+def: InstRW<[HWWriteResGroup39], (instregex "LDMXCSR")>;
+def: InstRW<[HWWriteResGroup39], (instregex "VLDMXCSR")>;
 
-// r16,r16.
-def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> {
-  let Latency = 4;
-  let NumMicroOps = 2;
+def HWWriteResGroup40 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>;
+def: InstRW<[HWWriteResGroup40], (instregex "LRETQ")>;
+def: InstRW<[HWWriteResGroup40], (instregex "RETQ")>;
 
-// r16,m16.
-def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
-  let Latency = 8;
+def HWWriteResGroup41 : SchedWriteRes<[HWPort23,HWPort0,HWPort15]> {
+  let Latency = 2;
   let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>;
+def: InstRW<[HWWriteResGroup41], (instregex "BEXTR32rm")>;
+def: InstRW<[HWWriteResGroup41], (instregex "BEXTR64rm")>;
 
-// MULX.
-// r32,r32,r32.
-def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> {
-  let Latency = 4;
+def HWWriteResGroup42 : SchedWriteRes<[HWPort23,HWPort0,HWPort0156]> {
+  let Latency = 2;
   let NumMicroOps = 3;
-  let ResourceCycles = [1, 2];
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteMulX32], (instregex "MULX32rr")>;
+def: InstRW<[HWWriteResGroup42], (instregex "ADC64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "ADC8rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVAE64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVB64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVE64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVG64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVGE64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVL64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVLE64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVNE64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVNO64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVNP64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVNS64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVO64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVP64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "CMOVS64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "SBB64rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "SBB8rm")>;
 
-// r32,r32,m32.
-def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> {
-  let Latency = 8;
+def HWWriteResGroup43 : SchedWriteRes<[HWPort4,HWPort6,HWPort237,HWPort0156]> {
+  let Latency = 2;
   let NumMicroOps = 4;
-  let ResourceCycles = [1, 2, 1];
+  let ResourceCycles = [1,1,1,1];
 }
-def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CALL64r")>;
+def: InstRW<[HWWriteResGroup43], (instregex "SETAm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "SETBEm")>;
 
-// r64,r64,r64.
-def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> {
-  let Latency = 4;
-  let NumMicroOps = 2;
+def HWWriteResGroup44 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0]> {
+  let Latency = 2;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,2];
 }
-def : InstRW<[WriteMulX64], (instregex "MULX64rr")>;
+def: InstRW<[HWWriteResGroup44], (instregex "ROL64m1")>;
+def: InstRW<[HWWriteResGroup44], (instregex "ROL64mi")>;
+def: InstRW<[HWWriteResGroup44], (instregex "ROL8m1")>;
+def: InstRW<[HWWriteResGroup44], (instregex "ROL8mi")>;
+def: InstRW<[HWWriteResGroup44], (instregex "ROR64m1")>;
+def: InstRW<[HWWriteResGroup44], (instregex "ROR64mi")>;
+def: InstRW<[HWWriteResGroup44], (instregex "ROR8m1")>;
+def: InstRW<[HWWriteResGroup44], (instregex "ROR8mi")>;
 
-// r64,r64,m64.
-def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
-  let Latency = 8;
-  let NumMicroOps = 3;
+def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,2];
 }
-def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>;
+def: InstRW<[HWWriteResGroup45], (instregex "XADD64rm")>;
+def: InstRW<[HWWriteResGroup45], (instregex "XADD8rm")>;
 
-// DIV.
-// r8.
-def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
-  let Latency = 22;
-  let NumMicroOps = 9;
+def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,1,1];
 }
-def : InstRW<[WriteDiv8], (instregex "DIV8r")>;
+def: InstRW<[HWWriteResGroup46], (instregex "CALL64m")>;
+def: InstRW<[HWWriteResGroup46], (instregex "FARCALL64")>;
 
-// r16.
-def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
-  let Latency = 23;
-  let NumMicroOps = 10;
+def HWWriteResGroup47 : SchedWriteRes<[HWPort0]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteDiv16], (instregex "DIV16r")>;
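+// Move-mask instructions: a single port-0 uop with 3-cycle latency.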
+def: InstRW<[HWWriteResGroup47], (instregex "MOVMSKPDrr")>;
+def: InstRW<[HWWriteResGroup47], (instregex "MOVMSKPSrr")>;
+def: InstRW<[HWWriteResGroup47], (instregex "PMOVMSKBrr")>;
+def: InstRW<[HWWriteResGroup47], (instregex "VMOVMSKPDYrr")>;
+def: InstRW<[HWWriteResGroup47], (instregex "VMOVMSKPDrr")>;
+def: InstRW<[HWWriteResGroup47], (instregex "VMOVMSKPSrr")>;
+def: InstRW<[HWWriteResGroup47], (instregex "VPMOVMSKBYrr")>;
+def: InstRW<[HWWriteResGroup47], (instregex "VPMOVMSKBrr")>;
 
-// r32.
-def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
-  let Latency = 22;
-  let NumMicroOps = 10;
+def HWWriteResGroup48 : SchedWriteRes<[HWPort1]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteDiv32], (instregex "DIV32r")>;
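+// 3-cycle single-uop port-1 operations: FP add/sub/compare/min/max, DQ<->PS conversions, integer multiply, bit scans/counts, PDEP/PEXT and double shifts.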
+def: InstRW<[HWWriteResGroup48], (instregex "ADDPDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "ADDPSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "ADDSDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "ADDSSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "ADDSUBPDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "ADDSUBPSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "BSF32rr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "BSR32rr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "CMPPDrri")>;
+def: InstRW<[HWWriteResGroup48], (instregex "CMPPSrri")>;
+def: InstRW<[HWWriteResGroup48], (instregex "CMPSDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "CMPSSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "COMISDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "COMISSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "CVTDQ2PSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "CVTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "CVTTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "IMUL32rri8")>;
+def: InstRW<[HWWriteResGroup48], (instregex "IMUL64rr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "IMUL8r")>;
+def: InstRW<[HWWriteResGroup48], (instregex "LZCNT32rr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "MAXPDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "MAXPSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "MAXSDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "MAXSSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "MINPDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "MINPSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "MINSDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "MINSSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "MMX_CVTPI2PSirr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "MUL8r")>;
+def: InstRW<[HWWriteResGroup48], (instregex "PDEP32rr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "PDEP64rr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "PEXT32rr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "PEXT64rr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "POPCNT32rr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "SHLD32rri8")>;
+def: InstRW<[HWWriteResGroup48], (instregex "SHRD32rri8")>;
+def: InstRW<[HWWriteResGroup48], (instregex "SUBPDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "SUBPSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "SUBSDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "SUBSSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "TZCNT32rr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "UCOMISDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "UCOMISSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VADDPDYrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VADDPDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VADDPSYrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VADDPSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VADDSDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VADDSSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VADDSUBPDYrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VADDSUBPDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VADDSUBPSYrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VADDSUBPSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCMPPDYrri")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCMPPDrri")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCMPPSYrri")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCMPPSrri")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCMPSDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCMPSSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCOMISDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCOMISSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCVTDQ2PSYrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCVTDQ2PSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCVTPS2DQYrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCVTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VCVTTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VMAXPDYrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VMAXPDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VMAXPSYrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VMAXPSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VMAXSDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VMAXSSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VMINPDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VMINPSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VMINSDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VMINSSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VSUBPDYrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VSUBPDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VSUBPSYrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VSUBPSrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VUCOMISDrr")>;
+def: InstRW<[HWWriteResGroup48], (instregex "VUCOMISSrr")>;
 
-// r64.
-def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
-  let Latency = 32;
-  let NumMicroOps = 36;
-}
-def : InstRW<[WriteDiv64], (instregex "DIV64r")>;
-
-// IDIV.
-// r8.
-def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
-  let Latency = 23;
-  let NumMicroOps = 9;
+def HWWriteResGroup49 : SchedWriteRes<[HWPort5]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>;
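+// 3-cycle single-uop port-5 operations: cross-lane shuffles and permutes, broadcasts, 128-bit insert/extract and YMM sign/zero extensions.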
+def: InstRW<[HWWriteResGroup49], (instregex "KSHIFTRDri")>;
+def: InstRW<[HWWriteResGroup49], (instregex "KSHIFTRWri")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VBROADCASTSDYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VBROADCASTSSrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VEXTRACTF128rr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VEXTRACTI128rr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VINSERTF128rr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VINSERTI128rr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPBROADCASTBYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPBROADCASTBrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPBROADCASTDYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPBROADCASTQYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPBROADCASTWYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPBROADCASTWrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPERM2I128rr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPERMDYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPERMQYri")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVSXBDYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVSXBQYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVSXBWYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVSXDQYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVSXWDYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVSXWQYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVZXBDYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVZXBQYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVZXBWYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVZXDQYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVZXWDYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVZXWQYrr")>;
 
-// r16.
-def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
-  let Latency = 23;
-  let NumMicroOps = 10;
+def HWWriteResGroup50 : SchedWriteRes<[HWPort1,HWPort23]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>;
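+// Memory forms of the 3-cycle port-1 FP/integer operations: a load uop plus the port-1 uop.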
+def: InstRW<[HWWriteResGroup50], (instregex "ADDPDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDPSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "BSF64rm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "BSR64rm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CMPPDrmi")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CMPPSrmi")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CMPSSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "COMISDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "COMISSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CVTDQ2PSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CVTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CVTTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "IMUL64m")>;
+def: InstRW<[HWWriteResGroup50], (instregex "IMUL64rm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "IMUL8m")>;
+def: InstRW<[HWWriteResGroup50], (instregex "LZCNT64rm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAXPDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAXPSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAXSDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAXSSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MINPDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MINPSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MINSDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MINSSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPS2PIirm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTTPS2PIirm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MUL64m")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MUL8m")>;
+def: InstRW<[HWWriteResGroup50], (instregex "PDEP32rm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "PDEP64rm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "PEXT32rm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "PEXT64rm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "POPCNT64rm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBPDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBPSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBSDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBSSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "TZCNT64rm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "UCOMISDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "UCOMISSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPDYrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPSYrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDYrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSYrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDYrmi")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDrmi")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSYrmi")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSrmi")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPSDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPSSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCOMISDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCOMISSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSYrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQYrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXPDYrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXPDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXPSYrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXPSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXSDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXSSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINPDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINPDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINPSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINPSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINSDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINSSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDYrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSYrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBSDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBSSrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISDrm")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISSrm")>;
 
-// r32.
-def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
-  let Latency = 22;
-  let NumMicroOps = 9;
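+// YMM permutes and packed sign/zero-extend loads: 2 uops (one shuffle uop on HWPort5,
+// one load uop on HWPort23), latency 3.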
+def HWWriteResGroup51 : SchedWriteRes<[HWPort5,HWPort23]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERM2F128rm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERM2I128rm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMDYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMPDYmi")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMPSYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMQYmi")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBDYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBQYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBWYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXDQYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWDYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWQYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBDYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBQYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBWYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXDQYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWDYrm")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWQYrm")>;
 
-// r64.
-def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
-  let Latency = 39;
-  let NumMicroOps = 59;
-}
-def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>;
-
-//-- Logic instructions --//
-
-// AND OR XOR.
-// m,r/i.
-def : InstRW<[Write2P0156_2P237_P4],
-             (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
-              "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
-
-// SHR SHL SAR.
-// m,i.
-def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
-  let NumMicroOps = 4;
-  let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup52 : SchedWriteRes<[HWPort0156]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [3];
 }
-def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
+def: InstRW<[HWWriteResGroup52], (instregex "XADD32rr")>;
+def: InstRW<[HWWriteResGroup52], (instregex "XADD8rr")>;
+def: InstRW<[HWWriteResGroup52], (instregex "XCHG8rr")>;
 
-// r,cl.
-def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>;
-
-// m,cl.
-def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> {
-  let NumMicroOps = 6;
-  let ResourceCycles = [3, 2, 1];
+def HWWriteResGroup53 : SchedWriteRes<[HWPort0,HWPort5]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
 }
-def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPSLLVDYrr")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPSLLVDrr")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPSRAVDYrr")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPSRAVDrr")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPSRLVDYrr")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPSRLVDrr")>;
 
-// ROR ROL.
-// r,1.
-def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>;
-
-// m,i.
-def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
-  let NumMicroOps = 5;
-  let ResourceCycles = [2, 2, 1];
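+// Integer horizontal add/subtract (PHADD/PHSUB families): 3 uops, two on HWPort5 and
+// one on HWPort15, latency 3.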
+def HWWriteResGroup54 : SchedWriteRes<[HWPort5,HWPort15]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
 }
-def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup54], (instregex "MMX_PHADDSWrr64")>;
+def: InstRW<[HWWriteResGroup54], (instregex "MMX_PHADDWrr64")>;
+def: InstRW<[HWWriteResGroup54], (instregex "MMX_PHADDrr64")>;
+def: InstRW<[HWWriteResGroup54], (instregex "MMX_PHSUBDrr64")>;
+def: InstRW<[HWWriteResGroup54], (instregex "MMX_PHSUBSWrr64")>;
+def: InstRW<[HWWriteResGroup54], (instregex "MMX_PHSUBWrr64")>;
+def: InstRW<[HWWriteResGroup54], (instregex "PHADDDrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "PHADDSWrr128")>;
+def: InstRW<[HWWriteResGroup54], (instregex "PHADDWrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "PHSUBDrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "PHSUBSWrr128")>;
+def: InstRW<[HWWriteResGroup54], (instregex "PHSUBWrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHADDDYrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHADDDrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHADDSWrr128")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHADDSWrr256")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHADDWYrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHADDWrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHSUBDYrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHSUBDrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHSUBSWrr128")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHSUBSWrr256")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHSUBWYrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "VPHSUBWrr")>;
 
-// r,cl.
-def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>;
+def HWWriteResGroup55 : SchedWriteRes<[HWPort5,HWPort0156]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup55], (instregex "MMX_PACKSSDWirr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "MMX_PACKSSWBirr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "MMX_PACKUSWBirr")>;
 
-// m,cl.
-def WriteRotateRMWCL : SchedWriteRes<[]> {
-  let NumMicroOps = 6;
+def HWWriteResGroup56 : SchedWriteRes<[HWPort6,HWPort0156]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
 }
-def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup56], (instregex "CLD")>;
 
-// RCR RCL.
-// r,1.
-def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> {
-  let Latency = 2;
+def HWWriteResGroup57 : SchedWriteRes<[HWPort0,HWPort0156]> {
+  let Latency = 3;
   let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
+  let ResourceCycles = [1,2];
 }
-def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup57], (instregex "CMOVA32rr")>;
+def: InstRW<[HWWriteResGroup57], (instregex "CMOVBE32rr")>;
+def: InstRW<[HWWriteResGroup57], (instregex "RCL32ri")>;
+def: InstRW<[HWWriteResGroup57], (instregex "RCL64r1")>;
+def: InstRW<[HWWriteResGroup57], (instregex "RCL8r1")>;
+def: InstRW<[HWWriteResGroup57], (instregex "RCL8ri")>;
+def: InstRW<[HWWriteResGroup57], (instregex "RCR32ri")>;
+def: InstRW<[HWWriteResGroup57], (instregex "RCR64r1")>;
+def: InstRW<[HWWriteResGroup57], (instregex "RCR8r1")>;
+def: InstRW<[HWWriteResGroup57], (instregex "RCR8ri")>;
+def: InstRW<[HWWriteResGroup57], (instregex "SHL64rCL")>;
+def: InstRW<[HWWriteResGroup57], (instregex "SHL8rCL")>;
 
-// m,1.
-def WriteRCm1 : SchedWriteRes<[]> {
-  let NumMicroOps = 6;
+def HWWriteResGroup58 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup58], (instregex "FNSTSWm")>;
 
-// r,i.
-def WriteRCri : SchedWriteRes<[HWPort0156]> {
-  let Latency = 6;
-  let NumMicroOps = 8;
+def HWWriteResGroup59 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2,1,1];
 }
-def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>;
+def: InstRW<[HWWriteResGroup59], (instregex "VPSLLVDYrm")>;
+def: InstRW<[HWWriteResGroup59], (instregex "VPSLLVDrm")>;
+def: InstRW<[HWWriteResGroup59], (instregex "VPSRAVDYrm")>;
+def: InstRW<[HWWriteResGroup59], (instregex "VPSRAVDrm")>;
+def: InstRW<[HWWriteResGroup59], (instregex "VPSRLVDYrm")>;
+def: InstRW<[HWWriteResGroup59], (instregex "VPSRLVDrm")>;
 
-// m,i.
-def WriteRCmi : SchedWriteRes<[]> {
-  let NumMicroOps = 11;
+def HWWriteResGroup60 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2,1,1];
 }
-def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>;
+def: InstRW<[HWWriteResGroup60], (instregex "MMX_PHADDSWrm64")>;
+def: InstRW<[HWWriteResGroup60], (instregex "MMX_PHADDWrm64")>;
+def: InstRW<[HWWriteResGroup60], (instregex "MMX_PHADDrm64")>;
+def: InstRW<[HWWriteResGroup60], (instregex "MMX_PHSUBDrm64")>;
+def: InstRW<[HWWriteResGroup60], (instregex "MMX_PHSUBSWrm64")>;
+def: InstRW<[HWWriteResGroup60], (instregex "MMX_PHSUBWrm64")>;
+def: InstRW<[HWWriteResGroup60], (instregex "PHADDDrm")>;
+def: InstRW<[HWWriteResGroup60], (instregex "PHADDSWrm128")>;
+def: InstRW<[HWWriteResGroup60], (instregex "PHADDWrm")>;
+def: InstRW<[HWWriteResGroup60], (instregex "PHSUBDrm")>;
+def: InstRW<[HWWriteResGroup60], (instregex "PHSUBSWrm128")>;
+def: InstRW<[HWWriteResGroup60], (instregex "PHSUBWrm")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHADDDYrm")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHADDDrm")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHADDSWrm128")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHADDSWrm256")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHADDWYrm")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHADDWrm")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHSUBDYrm")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHSUBDrm")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHSUBSWrm128")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHSUBSWrm256")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHSUBWYrm")>;
+def: InstRW<[HWWriteResGroup60], (instregex "VPHSUBWrm")>;
 
-// SHRD SHLD.
-// r,r,i.
-def WriteShDrr : SchedWriteRes<[HWPort1]> {
+def HWWriteResGroup61 : SchedWriteRes<[HWPort23,HWPort0,HWPort0156]> {
   let Latency = 3;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,2];
 }
-def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>;
+def: InstRW<[HWWriteResGroup61], (instregex "CMOVA64rm")>;
+def: InstRW<[HWWriteResGroup61], (instregex "CMOVBE64rm")>;
 
-// m,r,i.
-def WriteShDmr : SchedWriteRes<[]> {
+def HWWriteResGroup62 : SchedWriteRes<[HWPort23,HWPort237,HWPort0,HWPort0156]> {
+  let Latency = 3;
   let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,2];
 }
-def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>;
+def: InstRW<[HWWriteResGroup62], (instregex "RCL64m1")>;
+def: InstRW<[HWWriteResGroup62], (instregex "RCL64mi")>;
+def: InstRW<[HWWriteResGroup62], (instregex "RCL8m1")>;
+def: InstRW<[HWWriteResGroup62], (instregex "RCL8mi")>;
+def: InstRW<[HWWriteResGroup62], (instregex "RCR64m1")>;
+def: InstRW<[HWWriteResGroup62], (instregex "RCR64mi")>;
+def: InstRW<[HWWriteResGroup62], (instregex "RCR8m1")>;
+def: InstRW<[HWWriteResGroup62], (instregex "RCR8mi")>;
 
-// r,r,cl.
-def WriteShlDCL : SchedWriteRes<[HWPort0156]> {
+def HWWriteResGroup63 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
   let Latency = 3;
-  let NumMicroOps = 4;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,1,1,3];
 }
-def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>;
+def: InstRW<[HWWriteResGroup63], (instregex "ADC64mi8")>;
+def: InstRW<[HWWriteResGroup63], (instregex "ADC8mi")>;
+def: InstRW<[HWWriteResGroup63], (instregex "ADD8mi")>;
+def: InstRW<[HWWriteResGroup63], (instregex "AND8mi")>;
+def: InstRW<[HWWriteResGroup63], (instregex "OR8mi")>;
+def: InstRW<[HWWriteResGroup63], (instregex "SUB8mi")>;
+def: InstRW<[HWWriteResGroup63], (instregex "XCHG64rm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "XCHG8rm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "XOR8mi")>;
 
-// r,r,cl.
-def WriteShrDCL : SchedWriteRes<[HWPort0156]> {
-  let Latency = 4;
-  let NumMicroOps = 4;
+def HWWriteResGroup64 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0,HWPort0156]> {
+  let Latency = 3;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,1,1,2,1];
 }
-def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>;
+def: InstRW<[HWWriteResGroup64], (instregex "ADC64mr")>;
+def: InstRW<[HWWriteResGroup64], (instregex "ADC8mr")>;
+def: InstRW<[HWWriteResGroup64], (instregex "CMPXCHG64rm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "CMPXCHG8rm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "SBB64mi8")>;
+def: InstRW<[HWWriteResGroup64], (instregex "SBB64mr")>;
+def: InstRW<[HWWriteResGroup64], (instregex "SBB8mi")>;
+def: InstRW<[HWWriteResGroup64], (instregex "SBB8mr")>;
+def: InstRW<[HWWriteResGroup64], (instregex "SHL64mCL")>;
+def: InstRW<[HWWriteResGroup64], (instregex "SHL8mCL")>;
 
-// m,r,cl.
-def WriteShDmrCL : SchedWriteRes<[]> {
-  let NumMicroOps = 7;
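+// Scalar FP to integer conversions, register forms: 2 uops split across HWPort0 and
+// HWPort1, latency 4.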
+def HWWriteResGroup65 : SchedWriteRes<[HWPort0,HWPort1]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>;
-
-// BT.
-// r,r/i.
-def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CVTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CVTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CVTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CVTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CVTTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CVTTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CVTTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CVTTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "VCVTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "VCVTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "VCVTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "VCVTTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "VCVTTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "VCVTTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "VCVTTSS2SIrr")>;
 
-// m,r.
-def WriteBTmr : SchedWriteRes<[]> {
-  let NumMicroOps = 10;
+def HWWriteResGroup66 : SchedWriteRes<[HWPort0,HWPort5]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>;
-
-// m,i.
-def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup66], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[HWWriteResGroup66], (instregex "VPSLLDrr")>;
+def: InstRW<[HWWriteResGroup66], (instregex "VPSLLQrr")>;
+def: InstRW<[HWWriteResGroup66], (instregex "VPSLLWrr")>;
+def: InstRW<[HWWriteResGroup66], (instregex "VPSRADYrr")>;
+def: InstRW<[HWWriteResGroup66], (instregex "VPSRAWYrr")>;
+def: InstRW<[HWWriteResGroup66], (instregex "VPSRLDYrr")>;
+def: InstRW<[HWWriteResGroup66], (instregex "VPSRLQYrr")>;
+def: InstRW<[HWWriteResGroup66], (instregex "VPSRLWYrr")>;
+def: InstRW<[HWWriteResGroup66], (instregex "VPTESTYrr")>;
 
-// BTR BTS BTC.
-// r,r,i.
-def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
-
-// m,r.
-def WriteBTRSCmr : SchedWriteRes<[]> {
-  let NumMicroOps = 11;
+def HWWriteResGroup67 : SchedWriteRes<[HWPort1,HWPort5]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "CVTDQ2PDrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "CVTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "CVTPD2PSrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "CVTSD2SSrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "CVTSI2SD64rr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "CVTSI2SDrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "CVTSI2SSrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "CVTTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "MMX_CVTPD2PIirr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "MMX_CVTPI2PDirr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "MMX_CVTPS2PIirr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "MMX_CVTTPD2PIirr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "MMX_CVTTPS2PIirr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "VCVTDQ2PDrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "VCVTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "VCVTPD2PSrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "VCVTPS2PHrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "VCVTSI2SD64rr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "VCVTSI2SDrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "VCVTSI2SSrr")>;
+def: InstRW<[HWWriteResGroup67], (instregex "VCVTTPD2DQrr")>;
 
-// m,i.
-def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>;
-
-// BSF BSR.
-// r,r.
-def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>;
-
-// SETcc.
-// r.
-def : InstRW<[WriteShift],
-             (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>;
-// m.
-def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
-  let NumMicroOps = 3;
+def HWWriteResGroup68 : SchedWriteRes<[HWPort1,HWPort6]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteSetCCm],
-             (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>;
+def: InstRW<[HWWriteResGroup68], (instregex "IMUL64r")>;
+def: InstRW<[HWWriteResGroup68], (instregex "MUL64r")>;
+def: InstRW<[HWWriteResGroup68], (instregex "MULX64rr")>;
 
-// CLD STD.
-def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> {
+def HWWriteResGroup69 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+  let Latency = 4;
   let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>;
-
-// LZCNT TZCNT.
-// r,r.
-def : InstRW<[WriteP1_Lat3], (instregex "(L|TZCNT)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|TZCNT)(16|32|64)rm")>;
-
-// ANDN.
-// r,r.
-def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CVTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CVTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CVTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CVTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CVTTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CVTTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CVTTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "VCVTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "VCVTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "VCVTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "VCVTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "VCVTTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "VCVTTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "VCVTTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "VCVTTSS2SIrm")>;
 
-// BLSI BLSMSK BLSR.
-// r,r.
-def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>;
-
-// BEXTR.
-// r,r,r.
-def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>;
-// r,m,r.
-def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>;
-
-// BZHI.
-// r,r,r.
-def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>;
-// r,m,r.
-def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>;
-
-// PDEP PEXT.
-// r,r,r.
-def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
-// r,m,r.
-def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
-
-//-- Control transfer instructions --//
-
-// J(E|R)CXZ.
-def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> {
-  let NumMicroOps = 2;
+def HWWriteResGroup70 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTPS2PDYrm")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VPTESTYrm")>;
 
-// LOOP.
-def WriteLOOP : SchedWriteRes<[]> {
-  let NumMicroOps = 7;
+def HWWriteResGroup71 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteLOOP], (instregex "LOOP")>;
+def: InstRW<[HWWriteResGroup71], (instregex "CVTDQ2PDrm")>;
+def: InstRW<[HWWriteResGroup71], (instregex "CVTPD2DQrm")>;
+def: InstRW<[HWWriteResGroup71], (instregex "CVTPD2PSrm")>;
+def: InstRW<[HWWriteResGroup71], (instregex "CVTSD2SSrm")>;
+def: InstRW<[HWWriteResGroup71], (instregex "CVTTPD2DQrm")>;
+def: InstRW<[HWWriteResGroup71], (instregex "MMX_CVTPD2PIirm")>;
+def: InstRW<[HWWriteResGroup71], (instregex "MMX_CVTPI2PDirm")>;
+def: InstRW<[HWWriteResGroup71], (instregex "MMX_CVTTPD2PIirm")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VCVTDQ2PDrm")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VCVTSD2SSrm")>;
 
-// LOOP(N)E
-def WriteLOOPE : SchedWriteRes<[]> {
-  let NumMicroOps = 11;
+def HWWriteResGroup72 : SchedWriteRes<[HWPort1,HWPort6,HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>;
+def: InstRW<[HWWriteResGroup72], (instregex "MULX64rm")>;
 
-// CALL.
-// r.
-def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+def HWWriteResGroup73 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> {
+  let Latency = 4;
   let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VPBROADCASTBYrm")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VPBROADCASTBrm")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VPBROADCASTWYrm")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VPBROADCASTWrm")>;
 
-// m.
-def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+def HWWriteResGroup74 : SchedWriteRes<[HWPort0156]> {
+  let Latency = 4;
   let NumMicroOps = 4;
-  let ResourceCycles = [2, 1, 1];
+  let ResourceCycles = [4];
 }
-def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>;
+def: InstRW<[HWWriteResGroup74], (instregex "FNCLEX")>;
 
-// RET.
-def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> {
-  let NumMicroOps = 2;
-}
-def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>;
-
-// i.
-def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
+def HWWriteResGroup75 : SchedWriteRes<[HWPort015,HWPort0156]> {
+  let Latency = 4;
   let NumMicroOps = 4;
-  let ResourceCycles = [1, 2, 1];
-}
-def : InstRW<[WriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
-
-// BOUND.
-// r,m.
-def WriteBOUND : SchedWriteRes<[]> {
-  let NumMicroOps = 15;
+  let ResourceCycles = [1,3];
 }
-def : InstRW<[WriteBOUND], (instregex "BOUNDS(16|32)rm")>;
+def: InstRW<[HWWriteResGroup75], (instregex "VZEROUPPER")>;
 
-// INTO.
-def WriteINTO : SchedWriteRes<[]> {
+def HWWriteResGroup76 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> {
+  let Latency = 4;
   let NumMicroOps = 4;
+  let ResourceCycles = [1,1,2];
 }
-def : InstRW<[WriteINTO], (instregex "INTO")>;
-
-//-- String instructions --//
-
-// LODSB/W.
-def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>;
-
-// LODSD/Q.
-def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>;
-
-// STOS.
-def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> {
-  let NumMicroOps = 3;
-}
-def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>;
+def: InstRW<[HWWriteResGroup76], (instregex "LAR32rr")>;
 
-// MOVS.
-def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
+def HWWriteResGroup77 : SchedWriteRes<[HWPort0,HWPort4,HWPort237,HWPort15]> {
   let Latency = 4;
-  let NumMicroOps = 5;
-  let ResourceCycles = [2, 1, 2];
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
 }
-def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>;
-
-// SCAS.
-def : InstRW<[Write2P0156_P23], (instregex "SCAS(B|W|L|Q)")>;
+def: InstRW<[HWWriteResGroup77], (instregex "VMASKMOVPDYrm")>;
+def: InstRW<[HWWriteResGroup77], (instregex "VMASKMOVPDmr")>;
+def: InstRW<[HWWriteResGroup77], (instregex "VMASKMOVPSmr")>;
+def: InstRW<[HWWriteResGroup77], (instregex "VPMASKMOVDYmr")>;
+def: InstRW<[HWWriteResGroup77], (instregex "VPMASKMOVDmr")>;
+def: InstRW<[HWWriteResGroup77], (instregex "VPMASKMOVQYmr")>;
+def: InstRW<[HWWriteResGroup77], (instregex "VPMASKMOVQmr")>;
 
-// CMPS.
-def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
+def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> {
   let Latency = 4;
-  let NumMicroOps = 5;
-  let ResourceCycles = [2, 3];
-}
-def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
-
-//-- Synchronization instructions --//
-
-// XADD.
-def WriteXADD : SchedWriteRes<[]> {
-  let NumMicroOps = 5;
-}
-def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>;
-
-// CMPXCHG.
-def WriteCMPXCHG : SchedWriteRes<[]> {
-  let NumMicroOps = 6;
-}
-def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>;
-
-// CMPXCHG8B.
-def WriteCMPXCHG8B : SchedWriteRes<[]> {
-  let NumMicroOps = 15;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
 }
-def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>;
+def: InstRW<[HWWriteResGroup78], (instregex "VCVTPS2PHmr")>;
 
-// CMPXCHG16B.
-def WriteCMPXCHG16B : SchedWriteRes<[]> {
-  let NumMicroOps = 22;
+def HWWriteResGroup79 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
 }
-def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>;
-
-//-- Other --//
+def: InstRW<[HWWriteResGroup79], (instregex "SHLD64mri8")>;
+def: InstRW<[HWWriteResGroup79], (instregex "SHRD64mri8")>;
 
-// PAUSE.
-def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> {
+def HWWriteResGroup80 : SchedWriteRes<[HWPort1,HWPort6,HWPort23,HWPort0156]> {
+  let Latency = 4;
   let NumMicroOps = 5;
-  let ResourceCycles = [1, 3];
-}
-def : InstRW<[WritePAUSE], (instregex "PAUSE")>;
-
-// LEAVE.
-def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>;
-
-// XGETBV.
-def WriteXGETBV : SchedWriteRes<[]> {
-  let NumMicroOps = 8;
-}
-def : InstRW<[WriteXGETBV], (instregex "XGETBV")>;
-
-// RDTSC.
-def WriteRDTSC : SchedWriteRes<[]> {
-  let NumMicroOps = 15;
-}
-def : InstRW<[WriteRDTSC], (instregex "RDTSC")>;
-
-// RDPMC.
-def WriteRDPMC : SchedWriteRes<[]> {
-  let NumMicroOps = 34;
+  let ResourceCycles = [1,2,1,1];
 }
-def : InstRW<[WriteRDPMC], (instregex "RDPMC")>;
-
-// RDRAND.
-def WriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
-  let NumMicroOps = 17;
-  let ResourceCycles = [1, 16];
-}
-def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
-
-//=== Floating Point x87 Instructions ===//
-//-- Move instructions --//
-
-// FLD.
-// m80.
-def : InstRW<[WriteP01], (instregex "LD_Frr")>;
+def: InstRW<[HWWriteResGroup80], (instregex "LAR32rm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "LSL32rm")>;
 
-def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> {
+def HWWriteResGroup81 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
   let Latency = 4;
-  let NumMicroOps = 4;
-  let ResourceCycles = [2, 2];
-}
-def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>;
-
-// FBLD.
-// m80.
-def WriteFBLD : SchedWriteRes<[]> {
-  let Latency = 47;
-  let NumMicroOps = 43;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,1,4];
 }
-def : InstRW<[WriteFBLD], (instregex "FBLDm")>;
+def: InstRW<[HWWriteResGroup81], (instregex "PUSHF16")>;
+def: InstRW<[HWWriteResGroup81], (instregex "PUSHF64")>;
 
-// FST(P).
-// r.
-def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>;
-
-// m80.
-def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> {
-  let NumMicroOps = 7;
-  let ResourceCycles = [3, 2, 2];
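+// Single-uop HWPort0 operations with 5-cycle latency: vector integer multiplies,
+// PSADBW, PCMPGTQ and the RCP/RSQRT estimate instructions (register forms).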
+def HWWriteResGroup82 : SchedWriteRes<[HWPort0]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>;
+def: InstRW<[HWWriteResGroup82], (instregex "MMX_PMADDUBSWrr64")>;
+def: InstRW<[HWWriteResGroup82], (instregex "MMX_PMADDWDirr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "MMX_PMULHRSWrr64")>;
+def: InstRW<[HWWriteResGroup82], (instregex "MMX_PMULHUWirr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "MMX_PMULHWirr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "MMX_PMULLWirr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "MMX_PMULUDQirr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "MMX_PSADBWirr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "PCMPGTQrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "PHMINPOSUWrr128")>;
+def: InstRW<[HWWriteResGroup82], (instregex "PMADDUBSWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "PMADDWDrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "PMULDQrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "PMULHRSWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "PMULHUWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "PMULHWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "PMULLWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "PMULUDQrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "PSADBWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "RCPPSr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "RCPSSr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "RSQRTPSr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "RSQRTSSr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VMOVMSKPSYrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPCMPGTQYrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPCMPGTQrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPHMINPOSUWrr128")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMADDUBSWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMADDUBSWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMADDWDYrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMADDWDrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMULDQYrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMULDQrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMULHRSWYrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMULHRSWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMULHUWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMULHWYrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMULHWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMULLWYrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMULLWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPMULUDQrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPSADBWYrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VPSADBWrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VRSQRTPSr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VRSQRTSSr")>;
 
-// FBSTP.
-// m80.
-def WriteFBSTP : SchedWriteRes<[]> {
-  let NumMicroOps = 226;
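+// FP multiply and FMA, register forms: a single uop on HWPort01, latency 5.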
+def HWWriteResGroup83 : SchedWriteRes<[HWPort01]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>;
-
-// FXCHG.
-def : InstRW<[WriteNop], (instregex "XCH_F")>;
+def: InstRW<[HWWriteResGroup83], (instregex "MULPDrr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "MULPSrr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "MULSDrr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "MULSSrr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD132PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD132PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD132PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD132PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD132SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD132SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD213PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD213PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD213PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD213PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD213SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD213SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD231PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD231PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD231PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD231PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD231SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADD231SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB132PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB132PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB132PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB132PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB213PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB213PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB213PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB213PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB231PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB231PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB231PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMADDSUB231PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB132PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB132PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB132PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB132PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB132SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB132SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB213PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB213PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB213PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB213PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB213SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB213SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB231PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB231PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB231PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB231PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB231SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUB231SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD132PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD132PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD132PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD132PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD213PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD213PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD213PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD213PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD231PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD231PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD231PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFMSUBADD231PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD132PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD132PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD132PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD132PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD132SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD132SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD213PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD213PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD213PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD213PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD213SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD213SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD231PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD231PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD231PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD231PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD231SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMADD231SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB132PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB132PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB132PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB132PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB132SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB132SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB213PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB213PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB213PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB213PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB213SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB213SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB231PDYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB231PDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB231PSYr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB231PSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB231SDr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VFNMSUB231SSr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VMULPDYrr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VMULPDrr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VMULPSYrr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VMULPSrr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VMULSDrr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "VMULSSrr")>;
 
-// FILD.
-def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> {
-  let Latency = 6;
+def HWWriteResGroup84 : SchedWriteRes<[HWPort0,HWPort23]> {
+  let Latency = 5;
   let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>;
-
-// FIST(P) FISTTP.
-def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> {
-  let Latency = 7;
-  let NumMicroOps = 3;
-}
-def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>;
-
-// FLDZ.
-def : InstRW<[WriteP01], (instregex "LD_F0")>;
-
-// FLD1.
-def : InstRW<[Write2P01], (instregex "LD_F1")>;
-
-// FLDPI FLDL2E etc.
-def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>;
-
-// FCMOVcc.
-def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> {
-  let Latency = 2;
-  let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
-}
-def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>;
+def: InstRW<[HWWriteResGroup84], (instregex "MMX_PMADDUBSWrm64")>;
+def: InstRW<[HWWriteResGroup84], (instregex "MMX_PMADDWDirm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "MMX_PMULHRSWrm64")>;
+def: InstRW<[HWWriteResGroup84], (instregex "MMX_PMULHUWirm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "MMX_PMULHWirm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "MMX_PMULLWirm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "MMX_PMULUDQirm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "MMX_PSADBWirm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "PCMPGTQrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "PHMINPOSUWrm128")>;
+def: InstRW<[HWWriteResGroup84], (instregex "PMADDUBSWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "PMADDWDrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "PMULDQrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "PMULHRSWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "PMULHUWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "PMULHWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "PMULLWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "PMULUDQrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "PSADBWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "RCPPSm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "RCPSSm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "RSQRTPSm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "RSQRTSSm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPCMPGTQYrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPCMPGTQrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPHMINPOSUWrm128")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMADDUBSWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMADDUBSWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMADDWDYrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMADDWDrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULDQYrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULDQrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULHRSWYrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULHRSWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULHUWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULHUWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULHWYrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULHWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULLWYrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULLWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULUDQrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMULUDQrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPSADBWYrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPSADBWrm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VRCPPSm(_Int)?")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VRCPSSm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VRSQRTPSm")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VRSQRTSSm")>;
 
-// FNSTSW.
-// AX.
-def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> {
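+// FP multiply and FMA with a folded memory operand: one HWPort01 uop plus one load
+// uop on HWPort23.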
+def HWWriteResGroup85 : SchedWriteRes<[HWPort01,HWPort23]> {
+  let Latency = 5;
   let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>;
+def: InstRW<[HWWriteResGroup85], (instregex "MULPDrm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "MULPSrm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "MULSDrm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "MULSSrm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD132PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD132PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD132PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD132PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD132SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD132SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD213PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD213PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD213PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD213PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD213SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD213SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD231PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD231PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD231PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD231PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD231SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADD231SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB132PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB132PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB132PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB132PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB213PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB213PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB213PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB213PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB231PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB231PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB231PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMADDSUB231PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB132PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB132PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB132PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB132PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB132SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB132SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB213PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB213PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB213PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB213PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB213SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB213SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB231PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB231PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB231PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB231PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB231SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUB231SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD132PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD132PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD132PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD132PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD213PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD213PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD213PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD213PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD231PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD231PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD231PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFMSUBADD231PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD132PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD132PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD132PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD132PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD132SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD132SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD213PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD213PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD213PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD213PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD213SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD213SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD231PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD231PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD231PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD231PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD231SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMADD231SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB132PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB132PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB132PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB132PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB132SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB132SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB213PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB213PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB213PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB213PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB213SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB213SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB231PDYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB231PDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB231PSYm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB231PSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB231SDm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VFNMSUB231SSm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VMULPDYrm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VMULPDrm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VMULPSYrm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VMULPSrm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VMULSDrm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VMULSSrm")>;
 
-// m16.
-def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> {
-  let Latency = 6;
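+// FP horizontal add/subtract and 64-bit integer to float conversion, register forms:
+// 3 uops (one on HWPort1, two on HWPort5), latency 5.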
+def HWWriteResGroup86 : SchedWriteRes<[HWPort1,HWPort5]> {
+  let Latency = 5;
   let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
 }
-def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>;
+def: InstRW<[HWWriteResGroup86], (instregex "CVTSI2SS64rr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "HADDPDrr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "HADDPSrr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "HSUBPDrr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "HSUBPSrr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "VCVTSI2SS64rr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "VHADDPDrr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "VHADDPSYrr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "VHADDPSrr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "VHSUBPDYrr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "VHSUBPDrr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "VHSUBPSYrr")>;
+def: InstRW<[HWWriteResGroup86], (instregex "VHSUBPSrr")>;
 
-// FLDCW.
-def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> {
-  let Latency = 7;
+def HWWriteResGroup87 : SchedWriteRes<[HWPort1,HWPort6,HWPort0]> {
+  let Latency = 5;
   let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>;
+def: InstRW<[HWWriteResGroup87], (instregex "STR32r")>;
 
-// FNSTCW.
-def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+def HWWriteResGroup88 : SchedWriteRes<[HWPort1,HWPort0,HWPort0156]> {
+  let Latency = 5;
   let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>;
-
-// FINCSTP FDECSTP.
-def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>;
-
-// FFREE.
-def : InstRW<[WriteP01], (instregex "FFREE")>;
-
-// FNSAVE.
-def WriteFNSAVE : SchedWriteRes<[]> {
-  let NumMicroOps = 147;
-}
-def : InstRW<[WriteFNSAVE], (instregex "FSAVEm")>;
-
-// FRSTOR.
-def WriteFRSTOR : SchedWriteRes<[]> {
-  let NumMicroOps = 90;
-}
-def : InstRW<[WriteFRSTOR], (instregex "FRSTORm")>;
-
-//-- Arithmetic instructions --//
-
-// FABS.
-def : InstRW<[WriteP0], (instregex "ABS_F")>;
-
-// FCHS.
-def : InstRW<[WriteP0], (instregex "CHS_F")>;
-
-// FCOM(P) FUCOM(P).
-// r.
-def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr",
-                         "UCOM_FPr")>;
-// m.
-def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>;
-
-// FCOMPP FUCOMPP.
-// r.
-def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>;
-
-// FCOMI(P) FUCOMI(P).
-// m.
-def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
-                           "UCOM_FIPr")>;
-
-// FICOM(P).
-def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>;
-
-// FTST.
-def : InstRW<[WriteP1], (instregex "TST_F")>;
-
-// FXAM.
-def : InstRW<[Write2P1], (instregex "FXAM")>;
+def: InstRW<[HWWriteResGroup88], (instregex "MULX32rr")>;
 
-// FPREM.
-def WriteFPREM : SchedWriteRes<[]> {
-  let Latency = 19;
-  let NumMicroOps = 28;
-}
-def : InstRW<[WriteFPREM], (instregex "FPREM")>;
-
-// FPREM1.
-def WriteFPREM1 : SchedWriteRes<[]> {
-  let Latency = 27;
-  let NumMicroOps = 41;
-}
-def : InstRW<[WriteFPREM1], (instregex "FPREM1")>;
-
-// FRNDINT.
-def WriteFRNDINT : SchedWriteRes<[]> {
-  let Latency = 11;
-  let NumMicroOps = 17;
-}
-def : InstRW<[WriteFRNDINT], (instregex "FRNDINT")>;
-
-//-- Math instructions --//
-
-// FSCALE.
-def WriteFSCALE : SchedWriteRes<[]> {
-  let Latency = 75; // 49-125
-  let NumMicroOps = 50; // 25-75
-}
-def : InstRW<[WriteFSCALE], (instregex "FSCALE")>;
-
-// FXTRACT.
-def WriteFXTRACT : SchedWriteRes<[]> {
-  let Latency = 15;
-  let NumMicroOps = 17;
+def HWWriteResGroup89 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,2,1];
 }
-def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>;
-
-//-- Other instructions --//
+def: InstRW<[HWWriteResGroup89], (instregex "HADDPDrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "HADDPSrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "HSUBPDrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "HSUBPSrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VHADDPDrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VHADDPDrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VHADDPSYrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VHADDPSrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VHSUBPDYrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VHSUBPDrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VHSUBPSYrm")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VHSUBPSrm")>;
 
-// FNOP.
-def : InstRW<[WriteP01], (instregex "FNOP")>;
-
-// WAIT.
-def : InstRW<[Write2P01], (instregex "WAIT")>;
-
-// FNCLEX.
-def : InstRW<[Write5P0156], (instregex "FNCLEX")>;
-
-// FNINIT.
-def WriteFNINIT : SchedWriteRes<[]> {
-  let NumMicroOps = 26;
+def HWWriteResGroup90 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
 }
-def : InstRW<[WriteFNINIT], (instregex "FNINIT")>;
-
-//=== Integer MMX and XMM Instructions ===//
-//-- Move instructions --//
-
-// MOVD.
-// r32/64 <- (x)mm.
-def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr",
-                         "VMOVPDI2DIrr", "MOVPDI2DIrr")>;
-
-// (x)mm <- r32/64.
-def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr",
-                         "VMOVDI2PDIrr", "MOVDI2PDIrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "CVTTSS2SI64rm")>;
 
-// MOVQ.
-// r64 <- (x)mm.
-def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>;
-
-// (x)mm <- r64.
-def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>;
-
-// (x)mm <- (x)mm.
-def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>;
-
-// (V)MOVDQA/U.
-// x <- x.
-def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr",
-                           "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV",
-                           "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>;
-
-// MOVDQ2Q.
-def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>;
-
-// MOVQ2DQ.
-def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>;
-
-
-// PACKSSWB/DW.
-// mm <- mm.
-def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> {
-  let Latency = 2;
-  let NumMicroOps = 3;
-  let ResourceCycles = [3];
+def HWWriteResGroup91 : SchedWriteRes<[HWPort1,HWPort23,HWPort0,HWPort0156]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
 }
-def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr",
-                                  "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MULX32rm")>;
 
-// mm <- m64.
-def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> {
-  let Latency = 4;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 3];
+def HWWriteResGroup92 : SchedWriteRes<[HWPort6,HWPort0156]> {
+  let Latency = 5;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,4];
 }
-def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm",
-                                  "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "PAUSE")>;
 
-// VPMOVSX/ZX BW BD BQ DW DQ.
-// y <- x.
-def WriteVPMOVSX : SchedWriteRes<[HWPort5]> {
-  let Latency = 3;
-  let NumMicroOps = 1;
+def HWWriteResGroup93 : SchedWriteRes<[HWPort0,HWPort0156]> {
+  let Latency = 5;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,4];
 }
-def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "XSETBV")>;
 
-// PBLENDW.
-// x,x,i / v,v,v,i
-def WritePBLENDWr : SchedWriteRes<[HWPort5]>;
-def : InstRW<[WritePBLENDWr], (instregex "(V?)PBLENDW(Y?)rri")>;
-
-// x,m,i / v,v,m,i
-def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> {
-  let NumMicroOps = 2;
-  let Latency = 4;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup94 : SchedWriteRes<[HWPort0,HWPort0156]> {
+  let Latency = 5;
+  let NumMicroOps = 5;
+  let ResourceCycles = [2,3];
 }
-def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>;
+def: InstRW<[HWWriteResGroup94], (instregex "CMPXCHG32rr")>;
+def: InstRW<[HWWriteResGroup94], (instregex "CMPXCHG8rr")>;
+def: InstRW<[HWWriteResGroup94], (instregex "ROUNDPDr")>;
+def: InstRW<[HWWriteResGroup94], (instregex "ROUNDPSr")>;
+def: InstRW<[HWWriteResGroup94], (instregex "ROUNDSDr")>;
+def: InstRW<[HWWriteResGroup94], (instregex "ROUNDSSr")>;
+def: InstRW<[HWWriteResGroup94], (instregex "VBROADCASTF128")>;
+def: InstRW<[HWWriteResGroup94], (instregex "VPBROADCASTMB2QZrr")>;
+def: InstRW<[HWWriteResGroup94], (instregex "VROUNDPDr")>;
+def: InstRW<[HWWriteResGroup94], (instregex "VROUNDPSr")>;
+def: InstRW<[HWWriteResGroup94], (instregex "VROUNDSDr")>;
 
-// VPBLENDD.
-// v,v,v,i.
-def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>;
-def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>;
-
-// v,v,m,i
-def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> {
+def HWWriteResGroup95 : SchedWriteRes<[HWPort1,HWPort5]> {
+  let Latency = 6;
   let NumMicroOps = 2;
-  let Latency = 4;
-  let ResourceCycles = [1, 1];
-}
-def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>;
-
-// MASKMOVQ.
-def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> {
-  let Latency = 13;
-  let NumMicroOps = 4;
-  let ResourceCycles = [1, 1, 2];
-}
-def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>;
-
-// MASKMOVDQU.
-def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> {
-  let Latency = 14;
-  let NumMicroOps = 10;
-  let ResourceCycles = [4, 2, 4];
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>;
+def: InstRW<[HWWriteResGroup95], (instregex "VCVTDQ2PDYrr")>;
+def: InstRW<[HWWriteResGroup95], (instregex "VCVTPD2DQYrr")>;
+def: InstRW<[HWWriteResGroup95], (instregex "VCVTPD2PSYrr")>;
+def: InstRW<[HWWriteResGroup95], (instregex "VCVTPS2PHYrr")>;
+def: InstRW<[HWWriteResGroup95], (instregex "VCVTTPD2DQYrr")>;
+def: InstRW<[HWWriteResGroup95], (instregex "ROUNDPDm")>;
+def: InstRW<[HWWriteResGroup95], (instregex "ROUNDPSm")>;
+def: InstRW<[HWWriteResGroup95], (instregex "ROUNDSDm")>;
+def: InstRW<[HWWriteResGroup95], (instregex "ROUNDSSm")>;
+def: InstRW<[HWWriteResGroup95], (instregex "VROUNDPDm")>;
+def: InstRW<[HWWriteResGroup95], (instregex "VROUNDPDm")>;
+def: InstRW<[HWWriteResGroup95], (instregex "VROUNDPSm")>;
+def: InstRW<[HWWriteResGroup95], (instregex "VROUNDPSm")>;
+def: InstRW<[HWWriteResGroup95], (instregex "VROUNDSDm")>;
+def: InstRW<[HWWriteResGroup95], (instregex "VROUNDSSm")>;
 
-// VPMASKMOV D/Q.
-// v,v,m.
-def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> {
-  let Latency = 4;
+def HWWriteResGroup96 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+  let Latency = 6;
   let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
+  let ResourceCycles = [1,1,1];
 }
-def : InstRW<[WriteVPMASKMOVr, ReadAfterLd],
-                               (instregex "VPMASKMOV(D|Q)(Y?)rm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VCVTDQ2PDYrm")>;
 
-// m, v,v.
-def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
-  let Latency = 13;
+def HWWriteResGroup97 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> {
+  let Latency = 6;
   let NumMicroOps = 4;
-  let ResourceCycles = [1, 1, 1, 1];
+  let ResourceCycles = [1,1,1,1];
 }
-def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
+def: InstRW<[HWWriteResGroup97], (instregex "VCVTPS2PHYmr")>;
 
-// PMOVMSKB.
-def WritePMOVMSKB : SchedWriteRes<[HWPort0]> {
-  let Latency = 3;
+def HWWriteResGroup98 : SchedWriteRes<[HWPort1,HWPort6,HWPort0,HWPort0156]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
 }
-def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>;
+def: InstRW<[HWWriteResGroup98], (instregex "SLDT32r")>;
 
-// PEXTR B/W/D/Q.
-// r32,x,i.
-def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> {
-  let Latency = 2;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup99 : SchedWriteRes<[HWPort6,HWPort0156]> {
+  let Latency = 6;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,5];
 }
-def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>;
+def: InstRW<[HWWriteResGroup99], (instregex "STD")>;
 
-// m8,x,i.
-def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> {
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup100 : SchedWriteRes<[HWPort5]> {
+  let Latency = 7;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>;
+def: InstRW<[HWWriteResGroup100], (instregex "AESDECLASTrr")>;
+def: InstRW<[HWWriteResGroup100], (instregex "AESDECrr")>;
+def: InstRW<[HWWriteResGroup100], (instregex "AESENCLASTrr")>;
+def: InstRW<[HWWriteResGroup100], (instregex "AESENCrr")>;
+def: InstRW<[HWWriteResGroup100], (instregex "KANDQrr")>;
+def: InstRW<[HWWriteResGroup100], (instregex "VAESDECLASTrr")>;
+def: InstRW<[HWWriteResGroup100], (instregex "VAESDECrr")>;
+def: InstRW<[HWWriteResGroup100], (instregex "VAESENCrr")>;
 
-// VPBROADCAST B/W.
-// x, m8/16.
-def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
-  let Latency = 5;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup101 : SchedWriteRes<[HWPort5,HWPort23]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd],
-                                     (instregex "VPBROADCAST(B|W)rm")>;
+def: InstRW<[HWWriteResGroup101], (instregex "AESDECLASTrm")>;
+def: InstRW<[HWWriteResGroup101], (instregex "AESDECrm")>;
+def: InstRW<[HWWriteResGroup101], (instregex "AESENCLASTrm")>;
+def: InstRW<[HWWriteResGroup101], (instregex "AESENCrm")>;
+def: InstRW<[HWWriteResGroup101], (instregex "VAESDECLASTrm")>;
+def: InstRW<[HWWriteResGroup101], (instregex "VAESDECrm")>;
+def: InstRW<[HWWriteResGroup101], (instregex "VAESENCLASTrm")>;
+def: InstRW<[HWWriteResGroup101], (instregex "VAESENCrm")>;
 
-// y, m8/16
-def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
+def HWWriteResGroup102 : SchedWriteRes<[HWPort0,HWPort5]> {
   let Latency = 7;
   let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
-}
-def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd],
-                                     (instregex "VPBROADCAST(B|W)Yrm")>;
-
-// VPGATHERDD.
-// x.
-def WriteVPGATHERDD128 : SchedWriteRes<[]> {
-  let NumMicroOps = 20;
-}
-def : InstRW<[WriteVPGATHERDD128, ReadAfterLd], (instregex "VPGATHERDDrm")>;
-
-// y.
-def WriteVPGATHERDD256 : SchedWriteRes<[]> {
-  let NumMicroOps = 34;
-}
-def : InstRW<[WriteVPGATHERDD256, ReadAfterLd], (instregex "VPGATHERDDYrm")>;
-
-// VPGATHERQD.
-// x.
-def WriteVPGATHERQD128 : SchedWriteRes<[]> {
-  let NumMicroOps = 15;
-}
-def : InstRW<[WriteVPGATHERQD128, ReadAfterLd], (instregex "VPGATHERQDrm")>;
-
-// y.
-def WriteVPGATHERQD256 : SchedWriteRes<[]> {
-  let NumMicroOps = 22;
-}
-def : InstRW<[WriteVPGATHERQD256, ReadAfterLd], (instregex "VPGATHERQDYrm")>;
-
-// VPGATHERDQ.
-// x.
-def WriteVPGATHERDQ128 : SchedWriteRes<[]> {
-  let NumMicroOps = 12;
+  let ResourceCycles = [1,2];
 }
-def : InstRW<[WriteVPGATHERDQ128, ReadAfterLd], (instregex "VPGATHERDQrm")>;
+def: InstRW<[HWWriteResGroup102], (instregex "MPSADBWrri")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VMPSADBWYrri")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VMPSADBWrri")>;
 
-// y.
-def WriteVPGATHERDQ256 : SchedWriteRes<[]> {
-  let NumMicroOps = 20;
-}
-def : InstRW<[WriteVPGATHERDQ256, ReadAfterLd], (instregex "VPGATHERDQYrm")>;
-
-// VPGATHERQQ.
-// x.
-def WriteVPGATHERQQ128 : SchedWriteRes<[]> {
-  let NumMicroOps = 14;
-}
-def : InstRW<[WriteVPGATHERQQ128, ReadAfterLd], (instregex "VPGATHERQQrm")>;
-
-// y.
-def WriteVPGATHERQQ256 : SchedWriteRes<[]> {
-  let NumMicroOps = 22;
+def HWWriteResGroup103 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+  let Latency = 7;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,2,1];
 }
-def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "MPSADBWrmi")>;
+def: InstRW<[HWWriteResGroup103], (instregex "VMPSADBWYrmi")>;
+def: InstRW<[HWWriteResGroup103], (instregex "VMPSADBWrmi")>;
 
-//-- Arithmetic instructions --//
-
-////////////////////////////////////////////////////////////////////////////////
-// Horizontal add/sub  instructions.
-////////////////////////////////////////////////////////////////////////////////
-
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> {
-  let Latency = 5;
+def HWWriteResGroup104 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> {
+  let Latency = 9;
   let NumMicroOps = 3;
-  let ResourceCycles = [1, 2];
+  let ResourceCycles = [1,1,1];
 }
+def: InstRW<[HWWriteResGroup104], (instregex "DPPDrri")>;
+def: InstRW<[HWWriteResGroup104], (instregex "VDPPDrri")>;
 
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> {
+def HWWriteResGroup105 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
   let Latency = 9;
   let NumMicroOps = 4;
-  let ResourceCycles = [1, 2, 1];
+  let ResourceCycles = [1,1,1,1];
 }
+def: InstRW<[HWWriteResGroup105], (instregex "DPPDrmi")>;
+def: InstRW<[HWWriteResGroup105], (instregex "VDPPDrmi")>;
 
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> {
-  let Latency = 3;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 2];
-}
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> {
-  let Latency = 6;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 2, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> {
-  let Latency = 3;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 2];
-}
-def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64",
-                               "MMX_PHADDSWrr64",
-                               "MMX_PHSUB(W|D)rr64",
-                               "MMX_PHSUBSWrr64",
-                               "(V?)PH(ADD|SUB)(W|D)(Y?)rr",
-                               "(V?)PH(ADD|SUB)SWrr(256)?")>;
-
-// v <- v,m.
-def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
-  let Latency = 6;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 2, 1];
-}
-def : InstRW<[WritePHADDSUBm, ReadAfterLd],
-                              (instregex "MMX_PHADD(W?)rm64",
-                               "MMX_PHADDSWrm64",
-                               "MMX_PHSUB(W|D)rm64",
-                               "MMX_PHSUBSWrm64",
-                               "(V?)PH(ADD|SUB)(W|D)(Y?)rm",
-                               "(V?)PH(ADD|SUB)SWrm(128|256)?")>;
-
-// PCMPGTQ.
-// v <- v,v.
-def WritePCMPGTQr : SchedWriteRes<[HWPort0]> {
-  let Latency = 5;
-  let NumMicroOps = 1;
-}
-def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
-
-// v <- v,m.
-def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> {
-  let Latency = 5;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
-}
-def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>;
-
-// PMULLD.
-// x,x / y,y,y.
-def WritePMULLDr : SchedWriteRes<[HWPort0]> {
+def HWWriteResGroup106 : SchedWriteRes<[HWPort0]> {
   let Latency = 10;
   let NumMicroOps = 2;
   let ResourceCycles = [2];
 }
-def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>;
+def: InstRW<[HWWriteResGroup106], (instregex "PMULLDrr")>;
+def: InstRW<[HWWriteResGroup106], (instregex "VPMULLDYrr")>;
+def: InstRW<[HWWriteResGroup106], (instregex "VPMULLDrr")>;
 
-// x,m / y,y,m.
-def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> {
+def HWWriteResGroup107 : SchedWriteRes<[HWPort0,HWPort23]> {
   let Latency = 10;
   let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
-}
-def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>;
-
-//-- Logic instructions --//
-
-// PTEST.
-// v,v.
-def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> {
-  let Latency = 2;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
-}
-def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>;
-
-// v,m.
-def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
-  let Latency = 6;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
-}
-def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rm")>;
-
-// PSLL,PSRL,PSRA W/D/Q.
-// x,x / v,v,x.
-def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> {
-  let Latency = 2;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
-}
-def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>;
-
-// PSLL,PSRL DQ.
-def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>;
-
-//-- Other --//
-
-// EMMS.
-def WriteEMMS : SchedWriteRes<[]> {
-  let Latency = 13;
-  let NumMicroOps = 31;
-}
-def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>;
-
-//=== Floating Point XMM and YMM Instructions ===//
-//-- Move instructions --//
-
-// MOVMSKP S/D.
-// r32 <- x.
-def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> {
-  let Latency = 3;
+  let ResourceCycles = [2,1];
 }
-def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>;
+def: InstRW<[HWWriteResGroup107], (instregex "PMULLDrm")>;
+def: InstRW<[HWWriteResGroup107], (instregex "VPMULLDYrm")>;
+def: InstRW<[HWWriteResGroup107], (instregex "VPMULLDrm")>;
 
-// r32 <- y.
-def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> {
-  let Latency = 2;
+def HWWriteResGroup108 : SchedWriteRes<[HWPort0]> {
+  let Latency = 11;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>;
-
-// VPERM2F128.
-def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>;
-def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>;
+def: InstRW<[HWWriteResGroup108], (instregex "DIVPSrr")>;
+def: InstRW<[HWWriteResGroup108], (instregex "DIVSSrr")>;
 
-// BLENDVP S/D.
-def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>;
-def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>;
-
-// VBROADCASTF128.
-def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>;
-
-// EXTRACTPS.
-// r32,x,i.
-def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> {
+def HWWriteResGroup109 : SchedWriteRes<[HWPort0,HWPort23]> {
+  let Latency = 11;
   let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+def: InstRW<[HWWriteResGroup109], (instregex "DIVPSrm")>;
+def: InstRW<[HWWriteResGroup109], (instregex "DIVSSrm")>;
 
-// m32,x,i.
-def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
-  let Latency = 4;
+def HWWriteResGroup110 : SchedWriteRes<[HWPort0]> {
+  let Latency = 11;
   let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
-}
-def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
-
-// VEXTRACTF128.
-// x,y,i.
-def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>;
-
-// m128,y,i.
-def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> {
-  let Latency = 4;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+  let ResourceCycles = [3];
 }
-def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>;
-
-// VINSERTF128.
-// y,y,x,i.
-def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "PCMPISTRIrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "PCMPISTRM128rr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "VPCMPISTRIrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "VPCMPISTRM128rr")>;
 
-// y,y,m128,i.
-def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> {
-  let Latency = 4;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup111 : SchedWriteRes<[HWPort0,HWPort5]> {
+  let Latency = 11;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
 }
-def : InstRW<[WriteFShuffle256, ReadAfterLd], (instregex "VINSERTF128rm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "PCLMULQDQrr")>;
+def: InstRW<[HWWriteResGroup111], (instregex "VPCLMULQDQrr")>;
 
-// VMASKMOVP S/D.
-// v,v,m.
-def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> {
-  let Latency = 4;
+def HWWriteResGroup112 : SchedWriteRes<[HWPort0,HWPort015]> {
+  let Latency = 11;
   let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
+  let ResourceCycles = [2,1];
 }
-def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>;
+def: InstRW<[HWWriteResGroup112], (instregex "VRCPPSYr(_Int)?")>;
+def: InstRW<[HWWriteResGroup112], (instregex "VRSQRTPSYr")>;
 
-// m128,x,x.
-def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
-  let Latency = 13;
+def HWWriteResGroup113 : SchedWriteRes<[HWPort0,HWPort23]> {
+  let Latency = 11;
   let NumMicroOps = 4;
-  let ResourceCycles = [1, 1, 1, 1];
+  let ResourceCycles = [3,1];
 }
-def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>;
+def: InstRW<[HWWriteResGroup113], (instregex "PCMPISTRIrm")>;
+def: InstRW<[HWWriteResGroup113], (instregex "PCMPISTRM128rm")>;
+def: InstRW<[HWWriteResGroup113], (instregex "VPCMPISTRIrm")>;
+def: InstRW<[HWWriteResGroup113], (instregex "VPCMPISTRM128rm")>;
 
-// m256,y,y.
-def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
-  let Latency = 14;
+def HWWriteResGroup114 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+  let Latency = 11;
   let NumMicroOps = 4;
-  let ResourceCycles = [1, 1, 1, 1];
-}
-def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>;
-
-// VGATHERDPS.
-// x.
-def WriteVGATHERDPS128 : SchedWriteRes<[]> {
-  let NumMicroOps = 20;
-}
-def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>;
-
-// y.
-def WriteVGATHERDPS256 : SchedWriteRes<[]> {
-  let NumMicroOps = 34;
-}
-def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>;
-
-// VGATHERQPS.
-// x.
-def WriteVGATHERQPS128 : SchedWriteRes<[]> {
-  let NumMicroOps = 15;
-}
-def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>;
-
-// y.
-def WriteVGATHERQPS256 : SchedWriteRes<[]> {
-  let NumMicroOps = 22;
+  let ResourceCycles = [2,1,1];
 }
-def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>;
+def: InstRW<[HWWriteResGroup114], (instregex "PCLMULQDQrm")>;
+def: InstRW<[HWWriteResGroup114], (instregex "VPCLMULQDQrm")>;
+def: InstRW<[HWWriteResGroup114], (instregex "VRCPPSYm(_Int)?")>;
 
-// VGATHERDPD.
-// x.
-def WriteVGATHERDPD128 : SchedWriteRes<[]> {
-  let NumMicroOps = 12;
-}
-def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>;
-
-// y.
-def WriteVGATHERDPD256 : SchedWriteRes<[]> {
-  let NumMicroOps = 20;
+def HWWriteResGroup115 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
+  let Latency = 11;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2,1,1];
 }
-def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>;
+def: InstRW<[HWWriteResGroup115], (instregex "VRCPPSm")>;
+def: InstRW<[HWWriteResGroup115], (instregex "VRSQRTPSYm")>;
 
-// VGATHERQPD.
-// x.
-def WriteVGATHERQPD128 : SchedWriteRes<[]> {
+def HWWriteResGroup116 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0,HWPort15,HWPort0156]> {
+  let Latency = 11;
   let NumMicroOps = 14;
+  let ResourceCycles = [1,1,1,4,2,5];
 }
-def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>;
+def: InstRW<[HWWriteResGroup116], (instregex "CMPXCHG8B")>;
 
-// y.
-def WriteVGATHERQPD256 : SchedWriteRes<[]> {
-  let NumMicroOps = 22;
+def HWWriteResGroup117 : SchedWriteRes<[HWPort0]> {
+  let Latency = 13;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>;
-
-//-- Conversion instructions --//
-
-// CVTPD2PS.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>;
-
-// x,m128.
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>;
+def: InstRW<[HWWriteResGroup117], (instregex "SQRTPSr")>;
+def: InstRW<[HWWriteResGroup117], (instregex "SQRTSSr")>;
+def: InstRW<[HWWriteResGroup117], (instregex "VDIVPSrr")>;
+def: InstRW<[HWWriteResGroup117], (instregex "VDIVSSrr")>;
 
-// x,y.
-def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> {
-  let Latency = 5;
+def HWWriteResGroup118 : SchedWriteRes<[HWPort0,HWPort23]> {
+  let Latency = 13;
   let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>;
+def: InstRW<[HWWriteResGroup118], (instregex "SQRTPSm")>;
+def: InstRW<[HWWriteResGroup118], (instregex "SQRTSSm")>;
+def: InstRW<[HWWriteResGroup118], (instregex "VDIVPSrm")>;
+def: InstRW<[HWWriteResGroup118], (instregex "VDIVSSrm")>;
 
-// x,m256.
-def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
-  let Latency = 9;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup119 : SchedWriteRes<[HWPort0]> {
+  let Latency = 14;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>;
+def: InstRW<[HWWriteResGroup119], (instregex "DIVPDrr")>;
+def: InstRW<[HWWriteResGroup119], (instregex "DIVSDrr")>;
+def: InstRW<[HWWriteResGroup119], (instregex "VSQRTPSr")>;
+def: InstRW<[HWWriteResGroup119], (instregex "VSQRTSSr")>;
 
-// CVTSD2SS.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
-
-// x,m64.
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
-
-// CVTPS2PD.
-// x,x.
-def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> {
-  let Latency = 2;
+def HWWriteResGroup120 : SchedWriteRes<[HWPort5]> {
+  let Latency = 14;
   let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+  let ResourceCycles = [2];
 }
-def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>;
+def: InstRW<[HWWriteResGroup120], (instregex "AESIMCrr")>;
+def: InstRW<[HWWriteResGroup120], (instregex "VAESIMCrr")>;
 
-// x,m64.
-// y,m128.
-def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> {
-  let Latency = 5;
+def HWWriteResGroup121 : SchedWriteRes<[HWPort0,HWPort23]> {
+  let Latency = 14;
   let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+def: InstRW<[HWWriteResGroup121], (instregex "DIVPDrm")>;
+def: InstRW<[HWWriteResGroup121], (instregex "DIVSDrm")>;
+def: InstRW<[HWWriteResGroup121], (instregex "VSQRTPSm")>;
+def: InstRW<[HWWriteResGroup121], (instregex "VSQRTSSm")>;
 
-// y,x.
-def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> {
-  let Latency = 5;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup122 : SchedWriteRes<[HWPort5,HWPort23]> {
+  let Latency = 14;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
 }
-def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[HWWriteResGroup122], (instregex "AESIMCrm")>;
+def: InstRW<[HWWriteResGroup122], (instregex "VAESIMCrm")>;
 
-// CVTSS2SD.
-// x,x.
-def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> {
-  let Latency = 2;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup123 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> {
+  let Latency = 14;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2,1,1];
 }
-def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+def: InstRW<[HWWriteResGroup123], (instregex "DPPSrri")>;
+def: InstRW<[HWWriteResGroup123], (instregex "VDPPSYrri")>;
+def: InstRW<[HWWriteResGroup123], (instregex "VDPPSrri")>;
 
-// x,m32.
-def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> {
-  let Latency = 5;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup124 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
+  let Latency = 14;
+  let NumMicroOps = 5;
+  let ResourceCycles = [2,1,1,1];
 }
-def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+def: InstRW<[HWWriteResGroup124], (instregex "DPPSrmi")>;
+def: InstRW<[HWWriteResGroup124], (instregex "VDPPSYrmi")>;
+def: InstRW<[HWWriteResGroup124], (instregex "VDPPSrmi")>;
 
-// CVTDQ2PD.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>;
-
-// y,x.
-def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>;
-
-// CVT(T)PD2DQ.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>;
-// x,m128.
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>;
-// x,y.
-def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>;
-// x,m256.
-def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>;
-
-// CVT(T)PS2PI.
-// mm,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>;
-
-// CVTPI2PD.
-// x,mm.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>;
-
-// CVT(T)PD2PI.
-// mm,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>;
-
-// CVSTSI2SS.
-// x,r32.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
-
-// CVT(T)SS2SI.
-// r32,x.
-def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
-// r32,m32.
-def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
-
-// CVTSI2SD.
-// x,r32/64.
-def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SS(64)?rr")>;
-
-// CVTSD2SI.
-// r32/64
-def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>;
-// r32,m32.
-def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>;
-
-// VCVTPS2PH.
-// x,v,i.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>;
-// m,v,i.
-def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>;
-
-// VCVTPH2PS.
-// v,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>;
+def HWWriteResGroup125 : SchedWriteRes<[HWPort23,HWPort0156]> {
+  let Latency = 14;
+  let NumMicroOps = 15;
+  let ResourceCycles = [1,14];
+}
+def: InstRW<[HWWriteResGroup125], (instregex "POPF16")>;
 
-//-- Arithmetic instructions --//
+def HWWriteResGroup126 : SchedWriteRes<[HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort0,HWPort0156]> {
+  let Latency = 15;
+  let NumMicroOps = 8;
+  let ResourceCycles = [1,1,1,1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup126], (instregex "INSB")>;
+def: InstRW<[HWWriteResGroup126], (instregex "INSL")>;
+def: InstRW<[HWWriteResGroup126], (instregex "INSW")>;
 
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> {
-  let Latency = 5;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 2];
+def HWWriteResGroup127 : SchedWriteRes<[HWPort5]> {
+  let Latency = 16;
+  let NumMicroOps = 16;
+  let ResourceCycles = [16];
 }
-def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>;
+def: InstRW<[HWWriteResGroup127], (instregex "VZEROALL")>;
 
-// x,m / v,v,m.
-def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
-  let Latency = 9;
-  let NumMicroOps = 4;
-  let ResourceCycles = [1, 2, 1];
+def HWWriteResGroup128 : SchedWriteRes<[HWPort0,HWPort4,HWPort5,HWPort23,HWPort237,HWPort0,HWPort0156]> {
+  let Latency = 16;
+  let NumMicroOps = 19;
+  let ResourceCycles = [2,1,4,1,1,4,6];
 }
-def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>;
+def: InstRW<[HWWriteResGroup128], (instregex "CMPXCHG16B")>;
 
-// MULL SS/SD PS/PD.
-// x,x / v,v,v.
-def WriteMULr : SchedWriteRes<[HWPort01]> {
-  let Latency = 5;
+def HWWriteResGroup129 : SchedWriteRes<[HWPort0,HWPort5,HWPort0156]> {
+  let Latency = 18;
+  let NumMicroOps = 8;
+  let ResourceCycles = [4,3,1];
 }
-def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
+def: InstRW<[HWWriteResGroup129], (instregex "PCMPESTRIrr")>;
+def: InstRW<[HWWriteResGroup129], (instregex "VPCMPESTRIrr")>;
 
-// x,m / v,v,m.
-def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> {
-  let Latency = 9;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup130 : SchedWriteRes<[HWPort5,HWPort6,HWPort0,HWPort0156]> {
+  let Latency = 18;
+  let NumMicroOps = 8;
+  let ResourceCycles = [1,1,1,5];
 }
-def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>;
+def: InstRW<[HWWriteResGroup130], (instregex "CPUID")>;
+def: InstRW<[HWWriteResGroup130], (instregex "RDTSC")>;
 
-// VDIVPS.
-// y,y,y.
-def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> {
-  let Latency = 19; // 18-21 cycles.
-  let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
+def HWWriteResGroup131 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort0156]> {
+  let Latency = 18;
+  let NumMicroOps = 9;
+  let ResourceCycles = [4,3,1,1];
 }
-def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>;
+def: InstRW<[HWWriteResGroup131], (instregex "PCMPESTRIrm")>;
+def: InstRW<[HWWriteResGroup131], (instregex "VPCMPESTRIrm")>;
 
-// y,y,m256.
-def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
-  let Latency = 23; // 18-21 + 4 cycles.
-  let NumMicroOps = 4;
-  let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup132 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
+  let Latency = 18;
+  let NumMicroOps = 19;
+  let ResourceCycles = [3,1,15];
 }
-def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>;
+def: InstRW<[HWWriteResGroup132], (instregex "XRSTOR")>;
 
-// VDIVPD.
-// y,y,y.
-def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> {
-  let Latency = 27; // 19-35 cycles.
-  let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
+def HWWriteResGroup133 : SchedWriteRes<[HWPort0,HWPort5,HWPort015,HWPort0156]> {
+  let Latency = 19;
+  let NumMicroOps = 9;
+  let ResourceCycles = [4,3,1,1];
 }
-def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>;
+def: InstRW<[HWWriteResGroup133], (instregex "PCMPESTRM128rr")>;
+def: InstRW<[HWWriteResGroup133], (instregex "VPCMPESTRM128rr")>;
 
-// y,y,m256.
-def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
-  let Latency = 31; // 19-35 + 4 cycles.
-  let NumMicroOps = 4;
-  let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup134 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015,HWPort0156]> {
+  let Latency = 19;
+  let NumMicroOps = 10;
+  let ResourceCycles = [4,3,1,1,1];
 }
-def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "PCMPESTRM128rm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "VPCMPESTRM128rm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "SQRTPDr")>;
+def: InstRW<[HWWriteResGroup134], (instregex "SQRTSDr")>;
+def: InstRW<[HWWriteResGroup134], (instregex "VDIVPDrr")>;
+def: InstRW<[HWWriteResGroup134], (instregex "VDIVSDrr")>;
+def: InstRW<[HWWriteResGroup134], (instregex "SQRTPDm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "SQRTSDm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "VDIVPDrm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "VDIVSDrm")>;
 
-// VRCPPS.
-// y,y.
-def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> {
-  let Latency = 7;
-  let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
+def HWWriteResGroup135 : SchedWriteRes<[HWPort5,HWPort6,HWPort0156]> {
+  let Latency = 20;
+  let NumMicroOps = 10;
+  let ResourceCycles = [1,2,7];
 }
-def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
+def: InstRW<[HWWriteResGroup135], (instregex "MWAITrr")>;
 
-// y,m256.
-def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
-  let Latency = 11;
-  let NumMicroOps = 4;
-  let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup136 : SchedWriteRes<[HWPort0]> {
+  let Latency = 21;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>;
+def: InstRW<[HWWriteResGroup136], (instregex "VSQRTPDr")>;
+def: InstRW<[HWWriteResGroup136], (instregex "VSQRTSDr")>;
 
-// ROUND SS/SD PS/PD.
-// v,v,i.
-def WriteROUNDr : SchedWriteRes<[HWPort1]> {
-  let Latency = 6;
+def HWWriteResGroup137 : SchedWriteRes<[HWPort0,HWPort23]> {
+  let Latency = 21;
   let NumMicroOps = 2;
-  let ResourceCycles = [2];
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
+def: InstRW<[HWWriteResGroup137], (instregex "VSQRTPDm")>;
+def: InstRW<[HWWriteResGroup137], (instregex "VSQRTSDm")>;
 
-// v,m,i.
-def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> {
-  let Latency = 10;
+def HWWriteResGroup138 : SchedWriteRes<[HWPort0,HWPort015]> {
+  let Latency = 21;
   let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
+  let ResourceCycles = [2,1];
 }
-def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+def: InstRW<[HWWriteResGroup138], (instregex "VDIVPSYrr")>;
+def: InstRW<[HWWriteResGroup138], (instregex "VSQRTPSYr")>;
 
-// DPPS.
-// x,x,i / v,v,v,i.
-def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
-  let Latency = 14;
+def HWWriteResGroup139 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
+  let Latency = 21;
   let NumMicroOps = 4;
-  let ResourceCycles = [2, 1, 1];
-}
-def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>;
-
-// x,m,i / v,v,m,i.
-def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> {
-  let Latency = 18;
-  let NumMicroOps = 6;
-  let ResourceCycles = [2, 1, 1, 1, 1];
+  let ResourceCycles = [2,1,1];
 }
-def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>;
+def: InstRW<[HWWriteResGroup139], (instregex "VDIVPSYrm")>;
+def: InstRW<[HWWriteResGroup139], (instregex "VSQRTPSYm")>;
 
-// DPPD.
-// x,x,i.
-def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
-  let Latency = 9;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup140 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+  let Latency = 24;
+  let NumMicroOps = 27;
+  let ResourceCycles = [1,5,1,1,19];
 }
-def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>;
+def: InstRW<[HWWriteResGroup140], (instregex "XSAVE64")>;
 
-// x,m,i.
-def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> {
-  let Latency = 13;
-  let NumMicroOps = 4;
-  let ResourceCycles = [1, 1, 1, 1];
+def HWWriteResGroup141 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+  let Latency = 25;
+  let NumMicroOps = 28;
+  let ResourceCycles = [1,6,1,1,19];
 }
-def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>;
+def: InstRW<[HWWriteResGroup141], (instregex "XSAVE")>;
 
-// VFMADD.
-// v,v,v.
-def WriteFMADDr : SchedWriteRes<[HWPort01]> {
-  let Latency = 5;
-  let NumMicroOps = 1;
+def HWWriteResGroup142 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015]> {
+  let Latency = 28;
+  let NumMicroOps = 11;
+  let ResourceCycles = [2,7,1,1];
 }
-def : InstRW<[WriteFMADDr],
-    (instregex
-    // 3p forms.
-    "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?",
-    // 3s forms.
-    "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r",
-    // 4s/4s_int forms.
-    "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?",
-    // 4p forms.
-    "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>;
+def: InstRW<[HWWriteResGroup142], (instregex "AESKEYGENASSIST128rm")>;
+def: InstRW<[HWWriteResGroup142], (instregex "VAESKEYGENASSIST128rm")>;
 
-// v,v,m.
-def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> {
-  let Latency = 9;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
-}
-def : InstRW<[WriteFMADDm],
-    (instregex
-    // 3p forms.
-    "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?",
-    // 3s forms.
-    "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m",
-    // 4s/4s_int forms.
-    "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?",
-    // 4p forms.
-    "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>;
-
-//-- Math instructions --//
-
-// VSQRTPS.
-// y,y.
-def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
-  let Latency = 19;
-  let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
+def HWWriteResGroup143 : SchedWriteRes<[HWPort0,HWPort5,HWPort015]> {
+  let Latency = 29;
+  let NumMicroOps = 11;
+  let ResourceCycles = [2,7,2];
 }
-def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>;
+def: InstRW<[HWWriteResGroup143], (instregex "AESKEYGENASSIST128rr")>;
+def: InstRW<[HWWriteResGroup143], (instregex "VAESKEYGENASSIST128rr")>;
 
-// y,m256.
-def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
-  let Latency = 23;
-  let NumMicroOps = 4;
-  let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup145 : SchedWriteRes<[HWPort01,HWPort15,HWPort015,HWPort0156]> {
+  let Latency = 31;
+  let NumMicroOps = 31;
+  let ResourceCycles = [8,1,21,1];
 }
-def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>;
+def: InstRW<[HWWriteResGroup145], (instregex "MMX_EMMS")>;
 
-// VSQRTPD.
-// y,y.
-def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> {
-  let Latency = 28;
+def HWWriteResGroup146 : SchedWriteRes<[HWPort0,HWPort015]> {
+  let Latency = 35;
   let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
+  let ResourceCycles = [2,1];
 }
-def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>;
+def: InstRW<[HWWriteResGroup146], (instregex "VDIVPDYrr")>;
+def: InstRW<[HWWriteResGroup146], (instregex "VSQRTPDYr")>;
 
-// y,m256.
-def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
-  let Latency = 32;
+def HWWriteResGroup147 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
+  let Latency = 35;
   let NumMicroOps = 4;
-  let ResourceCycles = [2, 1, 1];
+  let ResourceCycles = [2,1,1];
 }
-def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>;
+def: InstRW<[HWWriteResGroup147], (instregex "VDIVPDYrm")>;
+def: InstRW<[HWWriteResGroup147], (instregex "VSQRTPDYm")>;
 
-// RSQRT SS/PS.
-// x,x.
-def WriteRSQRTr : SchedWriteRes<[HWPort0]> {
-  let Latency = 5;
+def HWWriteResGroup148 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort15,HWPort0156]> {
+  let Latency = 35;
+  let NumMicroOps = 18;
+  let ResourceCycles = [1,1,2,3,1,1,1,8];
 }
-def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>;
+def: InstRW<[HWWriteResGroup148], (instregex "VMCLEARm")>;
 
-// x,m128.
-def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> {
-  let Latency = 9;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup149 : SchedWriteRes<[HWPort5,HWPort0156]> {
+  let Latency = 42;
+  let NumMicroOps = 22;
+  let ResourceCycles = [2,20];
 }
-def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>;
+def: InstRW<[HWWriteResGroup149], (instregex "RDTSCP")>;
 
-// RSQRTPS 256.
-// y,y.
-def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
-  let Latency = 7;
-  let NumMicroOps = 3;
-  let ResourceCycles = [2, 1];
+def HWWriteResGroup150 : SchedWriteRes<[HWPort0,HWPort01,HWPort23,HWPort0,HWPort0,HWPort015,HWPort0156]> {
+  let Latency = 56;
+  let NumMicroOps = 64;
+  let ResourceCycles = [2,2,8,1,10,2,39];
 }
-def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>;
+def: InstRW<[HWWriteResGroup150], (instregex "FLDENVm")>;
+def: InstRW<[HWWriteResGroup150], (instregex "FLDENVm")>;
 
-// y,m256.
-def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
-  let Latency = 11;
-  let NumMicroOps = 4;
-  let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup151 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort0,HWPort0,HWPort15,HWPort0156]> {
+  let Latency = 59;
+  let NumMicroOps = 88;
+  let ResourceCycles = [4,4,31,1,2,1,45];
 }
-def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>;
-
-//-- Logic instructions --//
+def: InstRW<[HWWriteResGroup151], (instregex "FXRSTOR64")>;
 
-// AND, ANDN, OR, XOR PS/PD.
-// x,x / v,v,v.
-def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>;
-// x,m / v,v,m.
-def : InstRW<[WriteP5Ld, ReadAfterLd],
-                         (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>;
-
-//-- Other instructions --//
+def HWWriteResGroup152 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort0,HWPort0,HWPort15,HWPort0156]> {
+  let Latency = 59;
+  let NumMicroOps = 90;
+  let ResourceCycles = [4,2,33,1,2,1,47];
+}
+def: InstRW<[HWWriteResGroup152], (instregex "FXRSTOR")>;
 
-// VZEROUPPER.
-def WriteVZEROUPPER : SchedWriteRes<[]> {
-  let NumMicroOps = 4;
+def HWWriteResGroup153 : SchedWriteRes<[HWPort5,HWPort01,HWPort0156]> {
+  let Latency = 75;
+  let NumMicroOps = 15;
+  let ResourceCycles = [6,3,6];
 }
-def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>;
+def: InstRW<[HWWriteResGroup153], (instregex "FNINIT")>;
 
-// VZEROALL.
-def WriteVZEROALL : SchedWriteRes<[]> {
-  let NumMicroOps = 12;
+def HWWriteResGroup154 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> {
+  let Latency = 98;
+  let NumMicroOps = 32;
+  let ResourceCycles = [7,7,3,3,1,11];
 }
-def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>;
+def: InstRW<[HWWriteResGroup154], (instregex "DIV64r")>;
 
-// LDMXCSR.
-def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> {
-  let Latency = 6;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup155 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort0,HWPort0156]> {
+  let Latency = 112;
+  let NumMicroOps = 66;
+  let ResourceCycles = [4,2,4,8,14,34];
 }
-def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>;
+def: InstRW<[HWWriteResGroup155], (instregex "IDIV64r")>;
 
-// STMXCSR.
-def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> {
-  let Latency = 7;
-  let NumMicroOps = 4;
-  let ResourceCycles = [1, 1, 1, 1];
+def HWWriteResGroup156 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6,HWPort237,HWPort0,HWPort0156]> {
+  let Latency = 114;
+  let NumMicroOps = 100;
+  let ResourceCycles = [9,9,11,8,1,11,21,30];
 }
-def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>;
+def: InstRW<[HWWriteResGroup156], (instregex "FSTENVm")>;
+def: InstRW<[HWWriteResGroup156], (instregex "FSTENVm")>;
 
 } // SchedModel

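For readers skimming the tables above: each HWWriteResGroup/SBWriteResGroup definition is a TableGen SchedWriteRes record that lists the execution ports an instruction occupies and sets its Latency, NumMicroOps, and per-port ResourceCycles, and the InstRW lines that follow bind that record to the opcodes matched by their instregex patterns. Below is a minimal sketch of that pattern; the group name and the "SOME_OPCODE" regex are made up purely for illustration and do not appear in the tables.

// Illustrative sketch only. A hypothetical 2-uop instruction that spends one
// cycle on HWPort1 and one on HWPort5, with a 3-cycle latency, would be
// described and mapped like this:
def HWWriteResGroupExample : SchedWriteRes<[HWPort1, HWPort5]> {
  let Latency = 3;              // latency in cycles
  let NumMicroOps = 2;          // micro-op count
  let ResourceCycles = [1, 1];  // cycles consumed on HWPort1 and HWPort5
}
def : InstRW<[HWWriteResGroupExample], (instregex "SOME_OPCODE")>;
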
Modified: llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td (original)
+++ llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td Tue Jun 27 08:05:13 2017
@@ -24,8 +24,8 @@ def SandyBridgeModel : SchedMachineModel
   // Based on the LSD (loop-stream detector) queue size.
   let LoopMicroOpBufferSize = 28;
 
-  // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
-  // the scheduler to assign a default model to unrecognized opcodes.
+  // This flag is set to allow the scheduler to assign
+  // a default model to unrecognized opcodes.
   let CompleteModel = 0;
 }
 
@@ -48,6 +48,7 @@ def SBPort23 : ProcResource<2>;
 def SBPort4 : ProcResource<1>;
 
 // Many micro-ops are capable of issuing on multiple ports.
+def SBPort01  : ProcResGroup<[SBPort0, SBPort1]>;
 def SBPort05  : ProcResGroup<[SBPort0, SBPort5]>;
 def SBPort15  : ProcResGroup<[SBPort1, SBPort5]>;
 def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
@@ -157,31 +158,6 @@ def : WriteRes<WriteMPSADLd, [SBPort0, S
   let ResourceCycles = [1, 1, 1, 1];
 }
 
-////////////////////////////////////////////////////////////////////////////////
-// Horizontal add/sub  instructions.
-////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [SBPort1]> {
-  let Latency = 3;
-}
-
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [SBPort1, SBPort23]> {
-  let Latency = 7;
-  let ResourceCycles = [1, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [SBPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [SBPort15, SBPort23]> {
-  let Latency = 5;
-  let ResourceCycles = [1, 1];
-}
-
 // String instructions.
 // Packed Compare Implicit Length Strings, Return Mask
 def : WriteRes<WritePCmpIStrM, [SBPort015]> {
@@ -272,4 +248,2282 @@ def : WriteRes<WriteNop, []>;
 defm : SBWriteResPair<WriteFShuffle256, SBPort0,  1>;
 defm : SBWriteResPair<WriteShuffle256, SBPort0,  1>;
 defm : SBWriteResPair<WriteVarVecShift, SBPort0,  1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub  instructions.
+////////////////////////////////////////////////////////////////////////////////
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def : WriteRes<WriteFHAdd, [SBPort1]> {
+  let Latency = 3;
+}
+
+// x,m / v,v,m.
+def : WriteRes<WriteFHAddLd, [SBPort1, SBPort23]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 1];
+}
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def : WriteRes<WritePHAdd, [SBPort15]>;
+
+// v <- v,m.
+def : WriteRes<WritePHAddLd, [SBPort15, SBPort23]> {
+  let Latency = 5;
+  let ResourceCycles = [1, 1];
+}
+
+// Remaining SNB instrs.
+
+def SBWriteResGroup0 : SchedWriteRes<[SBPort0]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup0], (instregex "CVTSS2SDrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSLLDri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSLLQri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSLLWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSRADri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSRAWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSRLDri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSRLQri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSRLWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VCVTSS2SDrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPMOVMSKBrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSLLDri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSLLQri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSLLWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSRADri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSRAWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSRLDri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSRLQri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSRLWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDYrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSYrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSrr")>;
+
+def SBWriteResGroup1 : SchedWriteRes<[SBPort5]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup1], (instregex "ANDNPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "ANDNPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "ANDPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "ANDPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "FDECSTP")>;
+def: InstRW<[SBWriteResGroup1], (instregex "FFREE")>;
+def: InstRW<[SBWriteResGroup1], (instregex "FINCSTP")>;
+def: InstRW<[SBWriteResGroup1], (instregex "FNOP")>;
+def: InstRW<[SBWriteResGroup1], (instregex "INSERTPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "JMP64r")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOV64toPQIrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVAPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVAPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVDDUPrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVDI2PDIrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVHLPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVLHPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVSDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVSHDUPrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVSLDUPrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVSSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVUPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "MOVUPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "ORPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "ORPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "RETQ")>;
+def: InstRW<[SBWriteResGroup1], (instregex "SHUFPDrri")>;
+def: InstRW<[SBWriteResGroup1], (instregex "SHUFPSrri")>;
+def: InstRW<[SBWriteResGroup1], (instregex "UNPCKHPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "UNPCKHPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "UNPCKLPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "UNPCKLPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VANDNPDYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VANDNPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VANDNPSYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VANDNPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VANDPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VANDPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VANDPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VEXTRACTF128rr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VGATHERQPSZrm")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VINSERTF128rr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VINSERTPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVAPDYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVAPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVAPSYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVAPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVDDUPYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVDDUPrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVHLPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVHLPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVSDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVSHDUPYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVSHDUPrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVSLDUPYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVSLDUPrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVSSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVUPDYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVUPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVUPSYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VMOVUPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VORPDYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VORPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VORPSYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VORPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VPERMILPDri")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VPERMILPDrm")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VPERMILPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VPERMILPSri")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VPERMILPSrm")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VPERMILPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VPERMILPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VSHUFPDYrri")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VSHUFPDrri")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VSHUFPSYrri")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VSHUFPSrri")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VUNPCKHPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VUNPCKHPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VUNPCKLPDYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VUNPCKLPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VUNPCKLPSYrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VUNPCKLPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VXORPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "VXORPSrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "XORPDrr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "XORPSrr")>;
+
+def SBWriteResGroup2 : SchedWriteRes<[SBPort01]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup2], (instregex "LEA64_32r")>;
+
+def SBWriteResGroup3 : SchedWriteRes<[SBPort0]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup3], (instregex "BLENDPDrri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "BLENDPSrri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "BT32ri8")>;
+def: InstRW<[SBWriteResGroup3], (instregex "BT32rr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "BTC32ri8")>;
+def: InstRW<[SBWriteResGroup3], (instregex "BTC32rr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "BTR32ri8")>;
+def: InstRW<[SBWriteResGroup3], (instregex "BTR32rr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "BTS32ri8")>;
+def: InstRW<[SBWriteResGroup3], (instregex "BTS32rr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "CDQ")>;
+def: InstRW<[SBWriteResGroup3], (instregex "CQO")>;
+def: InstRW<[SBWriteResGroup3], (instregex "LAHF")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SAHF")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SAR32ri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SAR8ri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETAEr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETBr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETEr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETGEr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETGr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETLEr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETLr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETNEr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETNOr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETNPr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETNSr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETOr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETPr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SETSr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SHL32ri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SHL64r1")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SHL8r1")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SHL8ri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SHR32ri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "SHR8ri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "VBLENDPDYrri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "VBLENDPDrri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "VBLENDPSYrri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "VBLENDPSrri")>;
+def: InstRW<[SBWriteResGroup3], (instregex "VMOVDQAYrr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "VMOVDQArr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "VMOVDQUYrr")>;
+def: InstRW<[SBWriteResGroup3], (instregex "VMOVDQUrr")>;
+
+def SBWriteResGroup4 : SchedWriteRes<[SBPort15]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup4], (instregex "KORTESTBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "MMX_PABSBrr64")>;
+def: InstRW<[SBWriteResGroup4], (instregex "MMX_PABSDrr64")>;
+def: InstRW<[SBWriteResGroup4], (instregex "MMX_PABSWrr64")>;
+def: InstRW<[SBWriteResGroup4], (instregex "MMX_PADDQirr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "MMX_PALIGNR64irr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "MMX_PSHUFBrr64")>;
+def: InstRW<[SBWriteResGroup4], (instregex "MMX_PSIGNBrr64")>;
+def: InstRW<[SBWriteResGroup4], (instregex "MMX_PSIGNDrr64")>;
+def: InstRW<[SBWriteResGroup4], (instregex "MMX_PSIGNWrr64")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PABSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PABSDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PABSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PACKSSDWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PACKSSWBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PACKUSDWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PACKUSWBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PADDBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PADDDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PADDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PADDSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PADDSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PADDUSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PADDUSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PADDWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PALIGNRrri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PAVGBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PAVGWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PBLENDWrri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PCMPEQBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PCMPEQDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PCMPEQQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PCMPEQWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PCMPGTBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PCMPGTDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PCMPGTWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMAXSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMAXSDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMAXSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMAXUBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMAXUDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMAXUWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMINSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMINSDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMINSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMINUBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMINUDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMINUWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVSXBDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVSXBQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVSXBWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVSXDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVSXWDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVSXWQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVZXBDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVZXBQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVZXBWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVZXDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVZXWDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PMOVZXWQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSHUFBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSHUFDri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSHUFHWri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSHUFLWri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSIGNBrr128")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSIGNDrr128")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSIGNWrr128")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSLLDQri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSRLDQri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSUBBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSUBDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSUBQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSUBSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSUBSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSUBUSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSUBUSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PSUBWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PUNPCKHBWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PUNPCKHDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PUNPCKHQDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PUNPCKHWDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PUNPCKLBWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PUNPCKLDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PUNPCKLQDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "PUNPCKLWDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VMASKMOVPSYrm")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPABSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPABSDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPABSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPACKSSDWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPACKSSWBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPACKUSDWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPACKUSWBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPADDBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPADDDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPADDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPADDUSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPADDUSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPALIGNRrri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPAVGBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPAVGWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPBLENDWrri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPCMPEQBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPCMPEQDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPCMPEQWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPCMPGTBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPCMPGTDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPCMPGTWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMAXSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMAXSDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMAXSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMAXUBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMAXUDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMAXUWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMINSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMINSDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMINSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMINUBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMINUDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMINUWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVSXBDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVSXBQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVSXBWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVSXDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVSXWDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVSXWQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVZXBDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVZXBQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVZXBWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVZXDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVZXWDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPMOVZXWQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSHUFBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSHUFDri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSHUFLWri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSIGNBrr128")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSIGNDrr128")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSIGNWrr128")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSLLDQri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSRLDQri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSUBBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSUBDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSUBQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSUBSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSUBSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSUBUSBrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSUBUSWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPSUBWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPUNPCKHBWrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPUNPCKHDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPUNPCKHWDrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPUNPCKLDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPUNPCKLQDQrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VPUNPCKLWDrr")>;
+
+def SBWriteResGroup5 : SchedWriteRes<[SBPort015]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup5], (instregex "ADD32ri8")>;
+def: InstRW<[SBWriteResGroup5], (instregex "ADD32rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "ADD8ri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "ADD8rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "AND32ri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "AND64ri8")>;
+def: InstRW<[SBWriteResGroup5], (instregex "AND64rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "AND8ri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "AND8rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "CBW")>;
+def: InstRW<[SBWriteResGroup5], (instregex "CMC")>;
+def: InstRW<[SBWriteResGroup5], (instregex "CMP16ri8")>;
+def: InstRW<[SBWriteResGroup5], (instregex "CMP32i32")>;
+def: InstRW<[SBWriteResGroup5], (instregex "CMP64rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "CMP8ri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "CMP8rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "CWDE")>;
+def: InstRW<[SBWriteResGroup5], (instregex "DEC64r")>;
+def: InstRW<[SBWriteResGroup5], (instregex "DEC8r")>;
+def: InstRW<[SBWriteResGroup5], (instregex "INC64r")>;
+def: InstRW<[SBWriteResGroup5], (instregex "INC8r")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_MOVQ2DQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MOV32rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MOV8ri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MOV8rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MOVDQArr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MOVDQUrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MOVPQI2QIrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MOVSX32rr16")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MOVSX32rr8")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MOVZX32rr16")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MOVZX32rr8")>;
+def: InstRW<[SBWriteResGroup5], (instregex "NEG64r")>;
+def: InstRW<[SBWriteResGroup5], (instregex "NEG8r")>;
+def: InstRW<[SBWriteResGroup5], (instregex "NOT64r")>;
+def: InstRW<[SBWriteResGroup5], (instregex "NOT8r")>;
+def: InstRW<[SBWriteResGroup5], (instregex "OR64ri8")>;
+def: InstRW<[SBWriteResGroup5], (instregex "OR64rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "OR8ri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "OR8rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PANDNrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PANDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PORrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PXORrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "STC")>;
+def: InstRW<[SBWriteResGroup5], (instregex "SUB64ri8")>;
+def: InstRW<[SBWriteResGroup5], (instregex "SUB64rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "SUB8ri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "SUB8rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "TEST64rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "TEST8ri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "TEST8rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VMOVPQI2QIrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VMOVZPQILo2PQIrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPANDNrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPANDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPORrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPXORrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "XOR32rr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "XOR64ri8")>;
+def: InstRW<[SBWriteResGroup5], (instregex "XOR8ri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "XOR8rr")>;
+
+def SBWriteResGroup6 : SchedWriteRes<[SBPort0]> {
+  let Latency = 2;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup6], (instregex "MOVMSKPDrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOVMSKPSrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOVPDI2DIrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOVPQIto64rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "PMOVMSKBrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "VMOVMSKPDYrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "VMOVMSKPDrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "VMOVMSKPSrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "VMOVPDI2DIrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "VMOVPQIto64rr")>;
+
+def SBWriteResGroup8 : SchedWriteRes<[SBPort0]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SBWriteResGroup8], (instregex "BLENDVPDrr0")>;
+def: InstRW<[SBWriteResGroup8], (instregex "BLENDVPSrr0")>;
+def: InstRW<[SBWriteResGroup8], (instregex "ROL32ri")>;
+def: InstRW<[SBWriteResGroup8], (instregex "ROL8ri")>;
+def: InstRW<[SBWriteResGroup8], (instregex "ROR32ri")>;
+def: InstRW<[SBWriteResGroup8], (instregex "ROR8ri")>;
+def: InstRW<[SBWriteResGroup8], (instregex "SETAr")>;
+def: InstRW<[SBWriteResGroup8], (instregex "SETBEr")>;
+def: InstRW<[SBWriteResGroup8], (instregex "VBLENDVPDYrr")>;
+def: InstRW<[SBWriteResGroup8], (instregex "VBLENDVPDrr")>;
+def: InstRW<[SBWriteResGroup8], (instregex "VBLENDVPSYrr")>;
+def: InstRW<[SBWriteResGroup8], (instregex "VBLENDVPSrr")>;
+
+def SBWriteResGroup9 : SchedWriteRes<[SBPort15]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SBWriteResGroup9], (instregex "VPBLENDVBrr")>;
+
+def SBWriteResGroup10 : SchedWriteRes<[SBPort015]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SBWriteResGroup10], (instregex "SCASB")>;
+def: InstRW<[SBWriteResGroup10], (instregex "SCASL")>;
+def: InstRW<[SBWriteResGroup10], (instregex "SCASQ")>;
+def: InstRW<[SBWriteResGroup10], (instregex "SCASW")>;
+
+def SBWriteResGroup11 : SchedWriteRes<[SBPort0,SBPort1]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup11], (instregex "COMISDrr")>;
+def: InstRW<[SBWriteResGroup11], (instregex "COMISSrr")>;
+def: InstRW<[SBWriteResGroup11], (instregex "UCOMISDrr")>;
+def: InstRW<[SBWriteResGroup11], (instregex "UCOMISSrr")>;
+def: InstRW<[SBWriteResGroup11], (instregex "VCOMISDrr")>;
+def: InstRW<[SBWriteResGroup11], (instregex "VCOMISSrr")>;
+def: InstRW<[SBWriteResGroup11], (instregex "VUCOMISDrr")>;
+def: InstRW<[SBWriteResGroup11], (instregex "VUCOMISSrr")>;
+
+def SBWriteResGroup12 : SchedWriteRes<[SBPort0,SBPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup12], (instregex "CVTPS2PDrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "PTESTrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "VCVTPS2PDrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "VPTESTYrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "VPTESTrr")>;
+
+def SBWriteResGroup13 : SchedWriteRes<[SBPort0,SBPort15]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup13], (instregex "PSLLDrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "PSLLQrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "PSLLWrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "PSRADrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "PSRAWrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "PSRLDrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "PSRLQrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "PSRLWrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "VPSRADrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "VPSRAWrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "VPSRLDrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "VPSRLQrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "VPSRLWrr")>;
+
+def SBWriteResGroup14 : SchedWriteRes<[SBPort1,SBPort0]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup14], (instregex "BSWAP32r")>;
+
+def SBWriteResGroup15 : SchedWriteRes<[SBPort5,SBPort15]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup15], (instregex "PINSRBrr")>;
+def: InstRW<[SBWriteResGroup15], (instregex "PINSRDrr")>;
+def: InstRW<[SBWriteResGroup15], (instregex "PINSRQrr")>;
+def: InstRW<[SBWriteResGroup15], (instregex "PINSRWrri")>;
+def: InstRW<[SBWriteResGroup15], (instregex "VPINSRBrr")>;
+def: InstRW<[SBWriteResGroup15], (instregex "VPINSRDrr")>;
+def: InstRW<[SBWriteResGroup15], (instregex "VPINSRQrr")>;
+def: InstRW<[SBWriteResGroup15], (instregex "VPINSRWrri")>;
+
+def SBWriteResGroup16 : SchedWriteRes<[SBPort5,SBPort015]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup16], (instregex "MMX_MOVDQ2Qrr")>;
+
+def SBWriteResGroup17 : SchedWriteRes<[SBPort0,SBPort015]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup17], (instregex "ADC64ri8")>;
+def: InstRW<[SBWriteResGroup17], (instregex "ADC64rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "ADC8ri")>;
+def: InstRW<[SBWriteResGroup17], (instregex "ADC8rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVAE32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVB32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVE32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVG32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVGE32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVL32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVLE32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVNE32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVNO32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVNP32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVNS32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVO32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVP32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "CMOVS32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "SBB32rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "SBB64ri8")>;
+def: InstRW<[SBWriteResGroup17], (instregex "SBB8ri")>;
+def: InstRW<[SBWriteResGroup17], (instregex "SBB8rr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "SHLD32rri8")>;
+def: InstRW<[SBWriteResGroup17], (instregex "SHRD32rri8")>;
+
+def SBWriteResGroup18 : SchedWriteRes<[SBPort0]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup18], (instregex "MMX_PMADDUBSWrr64")>;
+def: InstRW<[SBWriteResGroup18], (instregex "MMX_PMULHRSWrr64")>;
+def: InstRW<[SBWriteResGroup18], (instregex "MMX_PMULUDQirr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "PMADDUBSWrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "PMADDWDrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "PMULDQrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "PMULHRSWrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "PMULHUWrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "PMULHWrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "PMULLDrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "PMULLWrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "PMULUDQrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "PSADBWrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "VMOVMSKPSYrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "VPMADDUBSWrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "VPMADDWDrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "VPMULDQrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "VPMULHRSWrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "VPMULHWrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "VPMULLDrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "VPMULLWrr")>;
+def: InstRW<[SBWriteResGroup18], (instregex "VPSADBWrr")>;
+
+def SBWriteResGroup19 : SchedWriteRes<[SBPort1]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup19], (instregex "ADDPDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ADDPSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ADDSDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ADDSSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ADDSUBPDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ADDSUBPSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "BSF32rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "BSR32rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMPPDrri")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMPPSrri")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMPSDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMPSSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CRC32r32r32")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CRC32r32r8")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CVTDQ2PSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CVTPS2DQrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CVTTPS2DQrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MAXPDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MAXPSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MAXSDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MAXSSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MINPDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MINPSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MINSDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MINSSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MMX_CVTPI2PSirr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MMX_CVTPS2PIirr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MMX_CVTTPS2PIirr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "MUL8r")>;
+def: InstRW<[SBWriteResGroup19], (instregex "POPCNT32rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ROUNDPDr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ROUNDPSr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ROUNDSDr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ROUNDSSr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SUBPDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SUBPSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SUBSDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SUBSSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VADDPDYrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VADDPDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VADDPSYrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VADDPSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VADDSDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VADDSSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VADDSUBPDYrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VADDSUBPDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VADDSUBPSYrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VADDSUBPSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VBROADCASTF128")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VCMPPDYrri")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VCMPPDrri")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VCMPPSYrri")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VCMPPSrri")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VCMPSDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VCMPSSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VCVTDQ2PSYrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VCVTDQ2PSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VCVTPS2DQYrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VCVTPS2DQrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VCVTTPS2DQrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VMAXPDYrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VMAXPDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VMAXPSYrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VMAXPSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VMAXSDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VMAXSSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VMINPDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VMINPSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VMINSDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VMINSSrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VPBROADCASTMB2QZrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VROUNDPDr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VROUNDPSr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VROUNDSDr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VSUBPDYrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VSUBPDrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VSUBPSYrr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "VSUBPSrr")>;
+
+def SBWriteResGroup20 : SchedWriteRes<[SBPort0,SBPort5]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup20], (instregex "EXTRACTPSrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "VEXTRACTPSrr")>;
+
+def SBWriteResGroup21 : SchedWriteRes<[SBPort0,SBPort15]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup21], (instregex "PEXTRBrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "PEXTRDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "PEXTRQrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "PEXTRWri")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VPEXTRBrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VPEXTRDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VPEXTRQrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VPEXTRWri")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SHL64rCL")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SHL8rCL")>;
+
+def SBWriteResGroup22 : SchedWriteRes<[SBPort15]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [3];
+}
+def: InstRW<[SBWriteResGroup22], (instregex "MMX_PHADDSWrr64")>;
+def: InstRW<[SBWriteResGroup22], (instregex "MMX_PHADDWrr64")>;
+def: InstRW<[SBWriteResGroup22], (instregex "MMX_PHADDrr64")>;
+def: InstRW<[SBWriteResGroup22], (instregex "MMX_PHSUBDrr64")>;
+def: InstRW<[SBWriteResGroup22], (instregex "MMX_PHSUBSWrr64")>;
+def: InstRW<[SBWriteResGroup22], (instregex "MMX_PHSUBWrr64")>;
+def: InstRW<[SBWriteResGroup22], (instregex "PHADDDrr")>;
+def: InstRW<[SBWriteResGroup22], (instregex "PHADDSWrr128")>;
+def: InstRW<[SBWriteResGroup22], (instregex "PHADDWrr")>;
+def: InstRW<[SBWriteResGroup22], (instregex "PHSUBDrr")>;
+def: InstRW<[SBWriteResGroup22], (instregex "PHSUBSWrr128")>;
+def: InstRW<[SBWriteResGroup22], (instregex "PHSUBWrr")>;
+def: InstRW<[SBWriteResGroup22], (instregex "VPHADDDrr")>;
+def: InstRW<[SBWriteResGroup22], (instregex "VPHADDSWrr128")>;
+def: InstRW<[SBWriteResGroup22], (instregex "VPHADDWrr")>;
+def: InstRW<[SBWriteResGroup22], (instregex "VPHSUBDrr")>;
+def: InstRW<[SBWriteResGroup22], (instregex "VPHSUBSWrr128")>;
+def: InstRW<[SBWriteResGroup22], (instregex "VPHSUBWrr")>;
+
+def SBWriteResGroup23 : SchedWriteRes<[SBPort015]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [3];
+}
+def: InstRW<[SBWriteResGroup23], (instregex "LEAVE64")>;
+def: InstRW<[SBWriteResGroup23], (instregex "XADD32rr")>;
+def: InstRW<[SBWriteResGroup23], (instregex "XADD8rr")>;
+
+def SBWriteResGroup24 : SchedWriteRes<[SBPort0,SBPort015]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup24], (instregex "CMOVA32rr")>;
+def: InstRW<[SBWriteResGroup24], (instregex "CMOVBE32rr")>;
+
+def SBWriteResGroup25 : SchedWriteRes<[SBPort0,SBPort1]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup25], (instregex "MUL64r")>;
+
+def SBWriteResGroup26 : SchedWriteRes<[SBPort1,SBPort5]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup26], (instregex "CVTDQ2PDrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "CVTPD2DQrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "CVTPD2PSrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "CVTSD2SSrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "CVTSI2SD64rr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "CVTSI2SDrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "CVTTPD2DQrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "MMX_CVTPD2PIirr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "MMX_CVTPI2PDirr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "MMX_CVTTPD2PIirr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "VCVTDQ2PDYrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "VCVTDQ2PDrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "VCVTPD2DQYrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "VCVTPD2DQrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "VCVTPD2PSYrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "VCVTPD2PSrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "VCVTSI2SD64rr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "VCVTSI2SDrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "VCVTTPD2DQYrr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "VCVTTPD2DQrr")>;
+
+def SBWriteResGroup27 : SchedWriteRes<[SBPort1,SBPort015]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup27], (instregex "MOV64sr")>;
+def: InstRW<[SBWriteResGroup27], (instregex "PAUSE")>;
+
+def SBWriteResGroup28 : SchedWriteRes<[SBPort0]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup28], (instregex "MULPDrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "MULPSrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "MULSDrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "MULSSrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "PCMPGTQrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "PHMINPOSUWrr128")>;
+def: InstRW<[SBWriteResGroup28], (instregex "RCPPSr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "RCPSSr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "RSQRTPSr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "RSQRTSSr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VMULPDYrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VMULPDrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VMULPSYrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VMULPSrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VMULSDrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VMULSSrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VPCMPGTQrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VPHMINPOSUWrr128")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VRSQRTPSr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VRSQRTSSr")>;
+
+def SBWriteResGroup29 : SchedWriteRes<[SBPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup29], (instregex "MOV32rm")>;
+def: InstRW<[SBWriteResGroup29], (instregex "MOV8rm")>;
+def: InstRW<[SBWriteResGroup29], (instregex "MOVSX32rm16")>;
+def: InstRW<[SBWriteResGroup29], (instregex "MOVSX32rm8")>;
+def: InstRW<[SBWriteResGroup29], (instregex "MOVZX32rm16")>;
+def: InstRW<[SBWriteResGroup29], (instregex "MOVZX32rm8")>;
+def: InstRW<[SBWriteResGroup29], (instregex "PREFETCH")>;
+
+def SBWriteResGroup30 : SchedWriteRes<[SBPort0,SBPort1]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup30], (instregex "CVTSD2SI64rr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "CVTSD2SIrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "CVTSS2SI64rr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "CVTSS2SIrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "CVTTSD2SI64rr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "CVTTSD2SIrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "CVTTSS2SI64rr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "CVTTSS2SIrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VCVTSD2SI64rr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VCVTSS2SI64rr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VCVTSS2SIrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VCVTTSD2SI64rr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VCVTTSD2SIrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VCVTTSS2SI64rr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VCVTTSS2SIrr")>;
+
+def SBWriteResGroup31 : SchedWriteRes<[SBPort4,SBPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup31], (instregex "MOV64mr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOV8mr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVAPDmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVAPSmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVDQAmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVDQUmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVHPDmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVHPSmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVLPDmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVLPSmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVNTDQmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVNTI_64mr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVNTImr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVNTPDmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVNTPSmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVPDI2DImr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVPQI2QImr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVPQIto64mr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVSSmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVUPDmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVUPSmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "PUSH64i8")>;
+def: InstRW<[SBWriteResGroup31], (instregex "PUSH64r")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VEXTRACTF128mr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVAPDYmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVAPDmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVAPSYmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVAPSmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVDQAYmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVDQAmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVDQUYmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVDQUmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVHPDmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVHPSmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVLPDmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVLPSmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVNTDQYmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVNTDQmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVNTPDYmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVNTPDmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVNTPSYmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVNTPSmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVPDI2DImr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVPQI2QImr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVPQIto64mr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVSDmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVSSmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVUPDYmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVUPDmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVUPSYmr")>;
+def: InstRW<[SBWriteResGroup31], (instregex "VMOVUPSmr")>;
+
+def SBWriteResGroup32 : SchedWriteRes<[SBPort0,SBPort15]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup32], (instregex "MPSADBWrri")>;
+def: InstRW<[SBWriteResGroup32], (instregex "VMPSADBWrri")>;
+
+def SBWriteResGroup33 : SchedWriteRes<[SBPort1,SBPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup33], (instregex "CLI")>;
+def: InstRW<[SBWriteResGroup33], (instregex "CVTSI2SS64rr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "CVTSI2SSrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "HADDPDrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "HADDPSrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "HSUBPDrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "HSUBPSrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VCVTSI2SS64rr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VCVTSI2SSrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VHADDPDrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VHADDPSYrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VHADDPSrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VHSUBPDYrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VHSUBPDrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VHSUBPSYrr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VHSUBPSrr")>;
+
+def SBWriteResGroup34 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup34], (instregex "CALL64r")>;
+def: InstRW<[SBWriteResGroup34], (instregex "EXTRACTPSmr")>;
+def: InstRW<[SBWriteResGroup34], (instregex "VEXTRACTPSmr")>;
+
+def SBWriteResGroup35 : SchedWriteRes<[SBPort4,SBPort01,SBPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup35], (instregex "VMASKMOVPDYrm")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VMASKMOVPDmr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VMASKMOVPSmr")>;
+
+def SBWriteResGroup36 : SchedWriteRes<[SBPort4,SBPort23,SBPort0]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup36], (instregex "SETAEm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETBm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETEm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETGEm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETGm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETLEm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETLm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETNEm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETNOm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETNPm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETNSm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETOm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETPm")>;
+def: InstRW<[SBWriteResGroup36], (instregex "SETSm")>;
+
+def SBWriteResGroup37 : SchedWriteRes<[SBPort4,SBPort23,SBPort15]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup37], (instregex "PEXTRBmr")>;
+def: InstRW<[SBWriteResGroup37], (instregex "VPEXTRBmr")>;
+def: InstRW<[SBWriteResGroup37], (instregex "VPEXTRDmr")>;
+def: InstRW<[SBWriteResGroup37], (instregex "VPEXTRWmr")>;
+
+def SBWriteResGroup38 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup38], (instregex "MOV8mi")>;
+def: InstRW<[SBWriteResGroup38], (instregex "STOSB")>;
+def: InstRW<[SBWriteResGroup38], (instregex "STOSL")>;
+def: InstRW<[SBWriteResGroup38], (instregex "STOSQ")>;
+def: InstRW<[SBWriteResGroup38], (instregex "STOSW")>;
+
+def SBWriteResGroup39 : SchedWriteRes<[SBPort5,SBPort015]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup39], (instregex "FNINIT")>;
+
+def SBWriteResGroup40 : SchedWriteRes<[SBPort0,SBPort015]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup40], (instregex "CMPXCHG32rr")>;
+def: InstRW<[SBWriteResGroup40], (instregex "CMPXCHG8rr")>;
+
+def SBWriteResGroup41 : SchedWriteRes<[SBPort4,SBPort23,SBPort0]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup41], (instregex "SETAm")>;
+def: InstRW<[SBWriteResGroup41], (instregex "SETBEm")>;
+
+def SBWriteResGroup42 : SchedWriteRes<[SBPort0,SBPort4,SBPort5,SBPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SBWriteResGroup42], (instregex "LDMXCSR")>;
+def: InstRW<[SBWriteResGroup42], (instregex "STMXCSR")>;
+def: InstRW<[SBWriteResGroup42], (instregex "VLDMXCSR")>;
+def: InstRW<[SBWriteResGroup42], (instregex "VSTMXCSR")>;
+
+def SBWriteResGroup43 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SBWriteResGroup43], (instregex "PEXTRDmr")>;
+def: InstRW<[SBWriteResGroup43], (instregex "PEXTRQmr")>;
+def: InstRW<[SBWriteResGroup43], (instregex "VPEXTRQmr")>;
+def: InstRW<[SBWriteResGroup43], (instregex "PUSHF16")>;
+def: InstRW<[SBWriteResGroup43], (instregex "PUSHF64")>;
+
+def SBWriteResGroup44 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SBWriteResGroup44], (instregex "CLFLUSH")>;
+
+def SBWriteResGroup45 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SBWriteResGroup45], (instregex "FXRSTOR")>;
+
+def SBWriteResGroup46 : SchedWriteRes<[SBPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup46], (instregex "LDDQUrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOV64toPQIrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVAPDrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVAPSrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVDDUPrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVDI2PDIrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVDQArm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVDQUrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVNTDQArm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVSHDUPrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVSLDUPrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVSSrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVUPDrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "MOVUPSrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "POP64r")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VBROADCASTSSrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VLDDQUYrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VLDDQUrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOV64toPQIrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVAPDrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVAPSrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVDDUPrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVDI2PDIrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVDQArm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVDQUrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVNTDQArm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVQI2PQIrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVSDrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVSHDUPrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVSLDUPrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVSSrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVUPDrm")>;
+def: InstRW<[SBWriteResGroup46], (instregex "VMOVUPSrm")>;
+
+def SBWriteResGroup47 : SchedWriteRes<[SBPort5,SBPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup47], (instregex "JMP64m")>;
+def: InstRW<[SBWriteResGroup47], (instregex "MOV64sm")>;
+
+def SBWriteResGroup48 : SchedWriteRes<[SBPort23,SBPort0]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup48], (instregex "BT64mi8")>;
+
+def SBWriteResGroup49 : SchedWriteRes<[SBPort23,SBPort15]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup49], (instregex "MMX_PABSBrm64")>;
+def: InstRW<[SBWriteResGroup49], (instregex "MMX_PABSDrm64")>;
+def: InstRW<[SBWriteResGroup49], (instregex "MMX_PABSWrm64")>;
+def: InstRW<[SBWriteResGroup49], (instregex "MMX_PALIGNR64irm")>;
+def: InstRW<[SBWriteResGroup49], (instregex "MMX_PSHUFBrm64")>;
+def: InstRW<[SBWriteResGroup49], (instregex "MMX_PSIGNBrm64")>;
+def: InstRW<[SBWriteResGroup49], (instregex "MMX_PSIGNDrm64")>;
+def: InstRW<[SBWriteResGroup49], (instregex "MMX_PSIGNWrm64")>;
+
+def SBWriteResGroup50 : SchedWriteRes<[SBPort23,SBPort015]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup50], (instregex "ADD64rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "ADD8rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "AND64rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "AND8rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "CMP64mi8")>;
+def: InstRW<[SBWriteResGroup50], (instregex "CMP64mr")>;
+def: InstRW<[SBWriteResGroup50], (instregex "CMP64rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "CMP8mi")>;
+def: InstRW<[SBWriteResGroup50], (instregex "CMP8mr")>;
+def: InstRW<[SBWriteResGroup50], (instregex "CMP8rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "LODSL")>;
+def: InstRW<[SBWriteResGroup50], (instregex "LODSQ")>;
+def: InstRW<[SBWriteResGroup50], (instregex "OR64rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "OR8rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "SUB64rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "SUB8rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "XOR64rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "XOR8rm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "POP64rmm")>;
+def: InstRW<[SBWriteResGroup50], (instregex "PUSH64rmm")>;
+
+def SBWriteResGroup51 : SchedWriteRes<[SBPort23]> {
+  let Latency = 7;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup51], (instregex "VBROADCASTSDYrm")>;
+def: InstRW<[SBWriteResGroup51], (instregex "VBROADCASTSSrm")>;
+def: InstRW<[SBWriteResGroup51], (instregex "VMOVAPDYrm")>;
+def: InstRW<[SBWriteResGroup51], (instregex "VMOVAPSYrm")>;
+def: InstRW<[SBWriteResGroup51], (instregex "VMOVDDUPYrm")>;
+def: InstRW<[SBWriteResGroup51], (instregex "VMOVDQAYrm")>;
+def: InstRW<[SBWriteResGroup51], (instregex "VMOVDQUYrm")>;
+def: InstRW<[SBWriteResGroup51], (instregex "VMOVSHDUPYrm")>;
+def: InstRW<[SBWriteResGroup51], (instregex "VMOVSLDUPYrm")>;
+def: InstRW<[SBWriteResGroup51], (instregex "VMOVUPDYrm")>;
+def: InstRW<[SBWriteResGroup51], (instregex "VMOVUPSYrm")>;
+
+def SBWriteResGroup52 : SchedWriteRes<[SBPort0,SBPort23]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup52], (instregex "CVTPS2PDrm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "CVTSS2SDrm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "VCVTPS2PDYrm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "VCVTPS2PDrm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "VCVTSS2SDrm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "VTESTPDrm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "VTESTPSrm")>;
+
+def SBWriteResGroup53 : SchedWriteRes<[SBPort5,SBPort23]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup53], (instregex "ANDNPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "ANDNPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "ANDPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "ANDPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "INSERTPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "MOVHPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "MOVHPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "MOVLPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "MOVLPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "ORPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "ORPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "SHUFPDrmi")>;
+def: InstRW<[SBWriteResGroup53], (instregex "SHUFPSrmi")>;
+def: InstRW<[SBWriteResGroup53], (instregex "UNPCKHPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "UNPCKHPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "UNPCKLPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "UNPCKLPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VANDNPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VANDNPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VANDPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VANDPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VBROADCASTF128")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VINSERTPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VMOVHPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VMOVHPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VMOVLPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VMOVLPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VORPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VORPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VPERMILPDmi")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VPERMILPDri")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VPERMILPSmi")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VPERMILPSri")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VSHUFPDrmi")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VSHUFPSrmi")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VUNPCKHPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VUNPCKHPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VUNPCKLPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VUNPCKLPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VXORPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "VXORPSrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "XORPDrm")>;
+def: InstRW<[SBWriteResGroup53], (instregex "XORPSrm")>;
+
+def SBWriteResGroup54 : SchedWriteRes<[SBPort5,SBPort015]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup54], (instregex "AESDECLASTrr")>;
+def: InstRW<[SBWriteResGroup54], (instregex "AESDECrr")>;
+def: InstRW<[SBWriteResGroup54], (instregex "AESENCLASTrr")>;
+def: InstRW<[SBWriteResGroup54], (instregex "AESENCrr")>;
+def: InstRW<[SBWriteResGroup54], (instregex "KANDQrr")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VAESDECLASTrr")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VAESDECrr")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VAESENCrr")>;
+
+def SBWriteResGroup55 : SchedWriteRes<[SBPort23,SBPort0]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup55], (instregex "BLENDPDrmi")>;
+def: InstRW<[SBWriteResGroup55], (instregex "BLENDPSrmi")>;
+def: InstRW<[SBWriteResGroup55], (instregex "VBLENDPDrmi")>;
+def: InstRW<[SBWriteResGroup55], (instregex "VBLENDPSrmi")>;
+def: InstRW<[SBWriteResGroup55], (instregex "VINSERTF128rm")>;
+
+def SBWriteResGroup56 : SchedWriteRes<[SBPort23,SBPort15]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup56], (instregex "MMX_PADDQirm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PABSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PABSDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PABSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PACKSSDWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PACKSSWBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PACKUSDWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PACKUSWBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PADDBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PADDDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PADDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PADDSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PADDSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PADDUSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PADDUSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PADDWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PALIGNRrmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PAVGBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PAVGWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PBLENDWrmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PCMPEQBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PCMPEQDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PCMPEQQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PCMPEQWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PCMPGTBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PCMPGTDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PCMPGTWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PINSRBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PINSRDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PINSRQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PINSRWrmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMAXSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMAXSDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMAXSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMAXUBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMAXUDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMAXUWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMINSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMINSDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMINSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMINUBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMINUDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMINUWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVSXBDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVSXBQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVSXBWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVSXDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVSXWDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVSXWQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVZXBDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVZXBQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVZXBWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVZXDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVZXWDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PMOVZXWQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSHUFBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSHUFDmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSHUFHWmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSHUFLWmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSIGNBrm128")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSIGNDrm128")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSIGNWrm128")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSUBBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSUBDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSUBQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSUBSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSUBSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSUBUSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSUBUSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PSUBWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PUNPCKHBWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PUNPCKHDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PUNPCKHQDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PUNPCKHWDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PUNPCKLBWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PUNPCKLDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PUNPCKLQDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "PUNPCKLWDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPABSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPABSDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPABSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPACKSSDWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPACKSSWBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPACKUSDWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPACKUSWBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPADDBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPADDDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPADDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPADDSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPADDSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPADDUSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPADDUSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPADDWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPALIGNRrmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPAVGBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPAVGWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPBLENDWrmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPCMPEQBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPCMPEQDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPCMPEQQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPCMPEQWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPCMPGTBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPCMPGTDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPCMPGTWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPINSRBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPINSRDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPINSRQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPINSRWrmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMAXSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMAXSDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMAXSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMAXUBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMAXUDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMAXUWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMINSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMINSDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMINSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMINUBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMINUDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMINUWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVSXBDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVSXBQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVSXBWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVSXDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVSXWDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVSXWQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVZXBDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVZXBQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVZXBWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVZXDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVZXWDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPMOVZXWQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSHUFBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSHUFDmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSHUFHWmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSHUFLWmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSIGNBrm128")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSIGNDrm128")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSIGNWrm128")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSUBBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSUBDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSUBQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSUBSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSUBSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSUBUSBrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSUBUSWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPSUBWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPUNPCKHBWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPUNPCKHDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPUNPCKHQDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPUNPCKHWDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPUNPCKLBWrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPUNPCKLDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPUNPCKLQDQrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPUNPCKLWDrm")>;
+
+def SBWriteResGroup57 : SchedWriteRes<[SBPort23,SBPort015]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup57], (instregex "PANDNrm")>;
+def: InstRW<[SBWriteResGroup57], (instregex "PANDrm")>;
+def: InstRW<[SBWriteResGroup57], (instregex "PORrm")>;
+def: InstRW<[SBWriteResGroup57], (instregex "PXORrm")>;
+def: InstRW<[SBWriteResGroup57], (instregex "VPANDNrm")>;
+def: InstRW<[SBWriteResGroup57], (instregex "VPANDrm")>;
+def: InstRW<[SBWriteResGroup57], (instregex "VPORrm")>;
+def: InstRW<[SBWriteResGroup57], (instregex "VPXORrm")>;
+
+def SBWriteResGroup58 : SchedWriteRes<[SBPort0,SBPort0]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup58], (instregex "VRCPPSr")>;
+def: InstRW<[SBWriteResGroup58], (instregex "VRSQRTPSYr")>;
+
+def SBWriteResGroup59 : SchedWriteRes<[SBPort5,SBPort23]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup59], (instregex "VERRm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VERWm")>;
+
+def SBWriteResGroup60 : SchedWriteRes<[SBPort23,SBPort015]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup60], (instregex "LODSB")>;
+def: InstRW<[SBWriteResGroup60], (instregex "LODSW")>;
+
+def SBWriteResGroup61 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup61], (instregex "FARJMP64")>;
+
+def SBWriteResGroup62 : SchedWriteRes<[SBPort23,SBPort0,SBPort015]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup62], (instregex "ADC64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "ADC8rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVAE64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVB64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVE64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVG64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVGE64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVL64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVLE64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVNE64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVNO64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVNP64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVNS64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVO64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVP64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "CMOVS64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "SBB64rm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "SBB8rm")>;
+
+def SBWriteResGroup63 : SchedWriteRes<[SBPort0,SBPort4,SBPort23]> {
+  let Latency = 7;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup63], (instregex "FNSTSWm")>;
+
+def SBWriteResGroup64 : SchedWriteRes<[SBPort1,SBPort5,SBPort015]> {
+  let Latency = 7;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup64], (instregex "SLDT32r")>;
+def: InstRW<[SBWriteResGroup64], (instregex "STR32r")>;
+
+def SBWriteResGroup65 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
+  let Latency = 7;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup65], (instregex "CALL64m")>;
+def: InstRW<[SBWriteResGroup65], (instregex "FNSTCW16m")>;
+
+def SBWriteResGroup66 : SchedWriteRes<[SBPort4,SBPort23,SBPort0]> {
+  let Latency = 7;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup66], (instregex "BTC64mi8")>;
+def: InstRW<[SBWriteResGroup66], (instregex "BTR64mi8")>;
+def: InstRW<[SBWriteResGroup66], (instregex "BTS64mi8")>;
+def: InstRW<[SBWriteResGroup66], (instregex "SAR64mi")>;
+def: InstRW<[SBWriteResGroup66], (instregex "SAR8mi")>;
+def: InstRW<[SBWriteResGroup66], (instregex "SHL64m1")>;
+def: InstRW<[SBWriteResGroup66], (instregex "SHL64mi")>;
+def: InstRW<[SBWriteResGroup66], (instregex "SHL8m1")>;
+def: InstRW<[SBWriteResGroup66], (instregex "SHL8mi")>;
+def: InstRW<[SBWriteResGroup66], (instregex "SHR64mi")>;
+def: InstRW<[SBWriteResGroup66], (instregex "SHR8mi")>;
+
+def SBWriteResGroup67 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
+  let Latency = 7;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup67], (instregex "ADD64mi8")>;
+def: InstRW<[SBWriteResGroup67], (instregex "ADD64mr")>;
+def: InstRW<[SBWriteResGroup67], (instregex "ADD8mi")>;
+def: InstRW<[SBWriteResGroup67], (instregex "ADD8mr")>;
+def: InstRW<[SBWriteResGroup67], (instregex "AND64mi8")>;
+def: InstRW<[SBWriteResGroup67], (instregex "AND64mr")>;
+def: InstRW<[SBWriteResGroup67], (instregex "AND8mi")>;
+def: InstRW<[SBWriteResGroup67], (instregex "AND8mr")>;
+def: InstRW<[SBWriteResGroup67], (instregex "DEC64m")>;
+def: InstRW<[SBWriteResGroup67], (instregex "DEC8m")>;
+def: InstRW<[SBWriteResGroup67], (instregex "INC64m")>;
+def: InstRW<[SBWriteResGroup67], (instregex "INC8m")>;
+def: InstRW<[SBWriteResGroup67], (instregex "NEG64m")>;
+def: InstRW<[SBWriteResGroup67], (instregex "NEG8m")>;
+def: InstRW<[SBWriteResGroup67], (instregex "NOT64m")>;
+def: InstRW<[SBWriteResGroup67], (instregex "NOT8m")>;
+def: InstRW<[SBWriteResGroup67], (instregex "OR64mi8")>;
+def: InstRW<[SBWriteResGroup67], (instregex "OR64mr")>;
+def: InstRW<[SBWriteResGroup67], (instregex "OR8mi")>;
+def: InstRW<[SBWriteResGroup67], (instregex "OR8mr")>;
+def: InstRW<[SBWriteResGroup67], (instregex "SUB64mi8")>;
+def: InstRW<[SBWriteResGroup67], (instregex "SUB64mr")>;
+def: InstRW<[SBWriteResGroup67], (instregex "SUB8mi")>;
+def: InstRW<[SBWriteResGroup67], (instregex "SUB8mr")>;
+def: InstRW<[SBWriteResGroup67], (instregex "TEST64rm")>;
+def: InstRW<[SBWriteResGroup67], (instregex "TEST8mi")>;
+def: InstRW<[SBWriteResGroup67], (instregex "TEST8rm")>;
+def: InstRW<[SBWriteResGroup67], (instregex "XOR64mi8")>;
+def: InstRW<[SBWriteResGroup67], (instregex "XOR64mr")>;
+def: InstRW<[SBWriteResGroup67], (instregex "XOR8mi")>;
+def: InstRW<[SBWriteResGroup67], (instregex "XOR8mr")>;
+
+def SBWriteResGroup68 : SchedWriteRes<[SBPort0,SBPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup68], (instregex "MMX_PMADDUBSWrm64")>;
+def: InstRW<[SBWriteResGroup68], (instregex "MMX_PMULHRSWrm64")>;
+def: InstRW<[SBWriteResGroup68], (instregex "VTESTPDYrm")>;
+def: InstRW<[SBWriteResGroup68], (instregex "VTESTPSYrm")>;
+
+def SBWriteResGroup69 : SchedWriteRes<[SBPort1,SBPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup69], (instregex "BSF64rm")>;
+def: InstRW<[SBWriteResGroup69], (instregex "BSR64rm")>;
+def: InstRW<[SBWriteResGroup69], (instregex "CRC32r32m16")>;
+def: InstRW<[SBWriteResGroup69], (instregex "CRC32r32m8")>;
+def: InstRW<[SBWriteResGroup69], (instregex "MUL8m")>;
+
+def SBWriteResGroup70 : SchedWriteRes<[SBPort5,SBPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup70], (instregex "VANDNPDYrm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VANDNPSYrm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VANDPDrm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VANDPSrm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VORPDYrm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VORPSYrm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VPERM2F128rm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VPERMILPDYri")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VPERMILPDmi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VPERMILPSYri")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VPERMILPSmi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VSHUFPDYrmi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VSHUFPSYrmi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VUNPCKHPDrm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VUNPCKHPSrm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VUNPCKLPDYrm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VUNPCKLPSYrm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VXORPDrm")>;
+def: InstRW<[SBWriteResGroup70], (instregex "VXORPSrm")>;
+
+def SBWriteResGroup71 : SchedWriteRes<[SBPort23,SBPort0]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup71], (instregex "VBLENDPDYrmi")>;
+def: InstRW<[SBWriteResGroup71], (instregex "VBLENDPSYrmi")>;
+
+def SBWriteResGroup72 : SchedWriteRes<[SBPort23,SBPort0]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup72], (instregex "BLENDVPDrm0")>;
+def: InstRW<[SBWriteResGroup72], (instregex "BLENDVPSrm0")>;
+def: InstRW<[SBWriteResGroup72], (instregex "VBLENDVPDrm")>;
+def: InstRW<[SBWriteResGroup72], (instregex "VBLENDVPSrm")>;
+def: InstRW<[SBWriteResGroup72], (instregex "VMASKMOVPDrm")>;
+def: InstRW<[SBWriteResGroup72], (instregex "VMASKMOVPSrm")>;
+
+def SBWriteResGroup73 : SchedWriteRes<[SBPort23,SBPort15]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup73], (instregex "PBLENDVBrr0")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VPBLENDVBrm")>;
+
+def SBWriteResGroup74 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup74], (instregex "COMISDrm")>;
+def: InstRW<[SBWriteResGroup74], (instregex "COMISSrm")>;
+def: InstRW<[SBWriteResGroup74], (instregex "UCOMISDrm")>;
+def: InstRW<[SBWriteResGroup74], (instregex "UCOMISSrm")>;
+def: InstRW<[SBWriteResGroup74], (instregex "VCOMISDrm")>;
+def: InstRW<[SBWriteResGroup74], (instregex "VCOMISSrm")>;
+def: InstRW<[SBWriteResGroup74], (instregex "VUCOMISDrm")>;
+def: InstRW<[SBWriteResGroup74], (instregex "VUCOMISSrm")>;
+
+def SBWriteResGroup75 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup75], (instregex "PTESTrm")>;
+def: InstRW<[SBWriteResGroup75], (instregex "VPTESTrm")>;
+
+def SBWriteResGroup76 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup76], (instregex "PSLLDrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "PSLLQrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "PSLLWrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "PSRADrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "PSRAWrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "PSRLDrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "PSRLQrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "PSRLWrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "VPSLLDri")>;
+def: InstRW<[SBWriteResGroup76], (instregex "VPSLLQri")>;
+def: InstRW<[SBWriteResGroup76], (instregex "VPSLLWri")>;
+def: InstRW<[SBWriteResGroup76], (instregex "VPSRADrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "VPSRAWrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "VPSRLDrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "VPSRLQrm")>;
+def: InstRW<[SBWriteResGroup76], (instregex "VPSRLWrm")>;
+
+def SBWriteResGroup77 : SchedWriteRes<[SBPort23,SBPort15]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup77], (instregex "MMX_PHADDSWrm64")>;
+def: InstRW<[SBWriteResGroup77], (instregex "MMX_PHADDWrm64")>;
+def: InstRW<[SBWriteResGroup77], (instregex "MMX_PHADDrm64")>;
+def: InstRW<[SBWriteResGroup77], (instregex "MMX_PHSUBDrm64")>;
+def: InstRW<[SBWriteResGroup77], (instregex "MMX_PHSUBSWrm64")>;
+def: InstRW<[SBWriteResGroup77], (instregex "MMX_PHSUBWrm64")>;
+
+def SBWriteResGroup78 : SchedWriteRes<[SBPort23,SBPort015]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup78], (instregex "CMPXCHG64rm")>;
+def: InstRW<[SBWriteResGroup78], (instregex "CMPXCHG8rm")>;
+
+def SBWriteResGroup79 : SchedWriteRes<[SBPort23,SBPort0,SBPort015]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup79], (instregex "CMOVA64rm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "CMOVBE64rm")>;
+
+def SBWriteResGroup80 : SchedWriteRes<[SBPort23,SBPort015]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+  let ResourceCycles = [2,3];
+}
+def: InstRW<[SBWriteResGroup80], (instregex "CMPSB")>;
+def: InstRW<[SBWriteResGroup80], (instregex "CMPSL")>;
+def: InstRW<[SBWriteResGroup80], (instregex "CMPSQ")>;
+def: InstRW<[SBWriteResGroup80], (instregex "CMPSW")>;
+
+def SBWriteResGroup81 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,2,2];
+}
+def: InstRW<[SBWriteResGroup81], (instregex "FLDCW16m")>;
+
+def SBWriteResGroup82 : SchedWriteRes<[SBPort4,SBPort23,SBPort0]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,2,2];
+}
+def: InstRW<[SBWriteResGroup82], (instregex "ROL64mi")>;
+def: InstRW<[SBWriteResGroup82], (instregex "ROL8mi")>;
+def: InstRW<[SBWriteResGroup82], (instregex "ROR64mi")>;
+def: InstRW<[SBWriteResGroup82], (instregex "ROR8mi")>;
+
+def SBWriteResGroup83 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,2,2];
+}
+def: InstRW<[SBWriteResGroup83], (instregex "MOVSB")>;
+def: InstRW<[SBWriteResGroup83], (instregex "MOVSL")>;
+def: InstRW<[SBWriteResGroup83], (instregex "MOVSQ")>;
+def: InstRW<[SBWriteResGroup83], (instregex "MOVSW")>;
+def: InstRW<[SBWriteResGroup83], (instregex "XADD64rm")>;
+def: InstRW<[SBWriteResGroup83], (instregex "XADD8rm")>;
+
+def SBWriteResGroup84 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SBWriteResGroup84], (instregex "FARCALL64")>;
+
+def SBWriteResGroup85 : SchedWriteRes<[SBPort4,SBPort23,SBPort0,SBPort015]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SBWriteResGroup85], (instregex "SHLD64mri8")>;
+def: InstRW<[SBWriteResGroup85], (instregex "SHRD64mri8")>;
+
+def SBWriteResGroup86 : SchedWriteRes<[SBPort0,SBPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup86], (instregex "MMX_PMULUDQirm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "PMADDUBSWrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "PMADDWDrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "PMULDQrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "PMULHRSWrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "PMULHUWrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "PMULHWrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "PMULLDrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "PMULLWrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "PMULUDQrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "PSADBWrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "VPMADDUBSWrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "VPMADDWDrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "VPMULDQrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "VPMULHRSWrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "VPMULHUWrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "VPMULHWrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "VPMULLDrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "VPMULLWrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "VPMULUDQrm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "VPSADBWrm")>;
+
+def SBWriteResGroup87 : SchedWriteRes<[SBPort1,SBPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup87], (instregex "ADDPDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "ADDPSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "ADDSDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "ADDSSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "ADDSUBPDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "ADDSUBPSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "CMPPDrmi")>;
+def: InstRW<[SBWriteResGroup87], (instregex "CMPPSrmi")>;
+def: InstRW<[SBWriteResGroup87], (instregex "CMPSSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "CVTDQ2PSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "CVTPS2DQrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "CVTSI2SD64rm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "CVTSI2SDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "CVTTPS2DQrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "MAXPDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "MAXPSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "MAXSDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "MAXSSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "MINPDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "MINPSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "MINSDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "MINSSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "MMX_CVTPI2PSirm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "MMX_CVTPS2PIirm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "MMX_CVTTPS2PIirm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "POPCNT64rm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "ROUNDPDm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "ROUNDPSm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "ROUNDSDm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "ROUNDSSm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "SUBPDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "SUBPSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "SUBSDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "SUBSSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VADDPDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VADDPSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VADDSDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VADDSSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VADDSUBPDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VADDSUBPSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VCMPPDrmi")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VCMPPSrmi")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VCMPSDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VCMPSSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VCVTDQ2PSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VCVTPS2DQrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VCVTSI2SD64rm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VCVTSI2SDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VCVTTPS2DQrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VMAXPDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VMAXPSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VMAXSDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VMAXSSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VMINPDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VMINPSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VMINSDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VMINSSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VROUNDPDm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VROUNDPSm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VROUNDSDm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VROUNDSSm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VSUBPDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VSUBPSrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VSUBSDrm")>;
+def: InstRW<[SBWriteResGroup87], (instregex "VSUBSSrm")>;
+
+def SBWriteResGroup88 : SchedWriteRes<[SBPort23,SBPort0]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup88], (instregex "VBLENDVPDYrm")>;
+def: InstRW<[SBWriteResGroup88], (instregex "VBLENDVPSYrm")>;
+def: InstRW<[SBWriteResGroup88], (instregex "VMASKMOVPDrm")>;
+def: InstRW<[SBWriteResGroup88], (instregex "VMASKMOVPSrm")>;
+
+def SBWriteResGroup89 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup89], (instregex "DPPDrri")>;
+def: InstRW<[SBWriteResGroup89], (instregex "VDPPDrri")>;
+
+def SBWriteResGroup90 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup90], (instregex "CVTSD2SI64rm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTSD2SIrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTSS2SI64rm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTSS2SIrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTTSD2SI64rm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTTSD2SIrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTTSS2SI64rm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTTSS2SIrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MUL64m")>;
+
+def SBWriteResGroup91 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup91], (instregex "VPTESTYrm")>;
+
+def SBWriteResGroup92 : SchedWriteRes<[SBPort23,SBPort15]> {
+  let Latency = 9;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup92], (instregex "PHADDDrm")>;
+def: InstRW<[SBWriteResGroup92], (instregex "PHADDSWrm128")>;
+def: InstRW<[SBWriteResGroup92], (instregex "PHADDWrm")>;
+def: InstRW<[SBWriteResGroup92], (instregex "PHSUBDrm")>;
+def: InstRW<[SBWriteResGroup92], (instregex "PHSUBSWrm128")>;
+def: InstRW<[SBWriteResGroup92], (instregex "PHSUBWrm")>;
+def: InstRW<[SBWriteResGroup92], (instregex "VPHADDDrm")>;
+def: InstRW<[SBWriteResGroup92], (instregex "VPHADDSWrm128")>;
+def: InstRW<[SBWriteResGroup92], (instregex "VPHADDWrm")>;
+def: InstRW<[SBWriteResGroup92], (instregex "VPHSUBDrm")>;
+def: InstRW<[SBWriteResGroup92], (instregex "VPHSUBSWrm128")>;
+def: InstRW<[SBWriteResGroup92], (instregex "VPHSUBWrm")>;
+def: InstRW<[SBWriteResGroup92], (instregex "SHL64mCL")>;
+def: InstRW<[SBWriteResGroup92], (instregex "SHL8mCL")>;
+
+def SBWriteResGroup93 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
+  let Latency = 9;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,2,3];
+}
+def: InstRW<[SBWriteResGroup93], (instregex "ADC64mi8")>;
+def: InstRW<[SBWriteResGroup93], (instregex "ADC8mi")>;
+def: InstRW<[SBWriteResGroup93], (instregex "SBB64mi8")>;
+def: InstRW<[SBWriteResGroup93], (instregex "SBB8mi")>;
+
+def SBWriteResGroup94 : SchedWriteRes<[SBPort4,SBPort23,SBPort0,SBPort015]> {
+  let Latency = 9;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,2,2,1];
+}
+def: InstRW<[SBWriteResGroup94], (instregex "ADC64mr")>;
+def: InstRW<[SBWriteResGroup94], (instregex "ADC8mr")>;
+def: InstRW<[SBWriteResGroup94], (instregex "SBB64mr")>;
+def: InstRW<[SBWriteResGroup94], (instregex "SBB8mr")>;
+
+def SBWriteResGroup95 : SchedWriteRes<[SBPort4,SBPort5,SBPort23,SBPort0,SBPort015]> {
+  let Latency = 9;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,1,2,1,1];
+}
+def: InstRW<[SBWriteResGroup95], (instregex "BT64mr")>;
+def: InstRW<[SBWriteResGroup95], (instregex "BTC64mr")>;
+def: InstRW<[SBWriteResGroup95], (instregex "BTR64mr")>;
+def: InstRW<[SBWriteResGroup95], (instregex "BTS64mr")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VADDPDYrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VADDPSYrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VADDSUBPDYrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VADDSUBPSYrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VCMPPDYrmi")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VCMPPSYrmi")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VCVTDQ2PSYrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VCVTPS2DQYrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VCVTTPS2DQrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VMAXPDYrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VMAXPSYrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VMINPDrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VMINPSrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VROUNDPDm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VROUNDPSm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VSUBPDYrm")>;
+def: InstRW<[SBWriteResGroup95], (instregex "VSUBPSYrm")>;
+
+def SBWriteResGroup96 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup96], (instregex "VCVTSD2SI64rm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VCVTSD2SI64rr")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VCVTSS2SI64rm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VCVTSS2SIrm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VCVTTSD2SI64rm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VCVTTSD2SI64rr")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VCVTTSS2SI64rm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VCVTTSS2SIrm")>;
+
+def SBWriteResGroup97 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup97], (instregex "CVTDQ2PDrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "CVTPD2DQrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "CVTPD2PSrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "CVTSD2SSrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "CVTSI2SS64rm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "CVTSI2SSrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "CVTTPD2DQrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "MMX_CVTPD2PIirm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "MMX_CVTPI2PDirm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "MMX_CVTTPD2PIirm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "VCVTDQ2PDYrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "VCVTDQ2PDrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "VCVTPD2DQrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "VCVTPD2PSrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "VCVTSD2SSrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "VCVTSI2SS64rm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "VCVTSI2SSrm")>;
+def: InstRW<[SBWriteResGroup97], (instregex "VCVTTPD2DQrm")>;
+
+def SBWriteResGroup98 : SchedWriteRes<[SBPort0,SBPort23]> {
+  let Latency = 11;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup98], (instregex "MULPDrm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "MULPSrm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "MULSDrm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "MULSSrm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "PCMPGTQrm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "PHMINPOSUWrm128")>;
+def: InstRW<[SBWriteResGroup98], (instregex "RCPPSm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "RCPSSm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "RSQRTPSm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "RSQRTSSm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "VMULPDrm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "VMULPSrm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "VMULSDrm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "VMULSSrm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "VPCMPGTQrm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "VPHMINPOSUWrm128")>;
+def: InstRW<[SBWriteResGroup98], (instregex "VRCPPSm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "VRCPSSm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "VRSQRTPSm")>;
+def: InstRW<[SBWriteResGroup98], (instregex "VRSQRTSSm")>;
+
+def SBWriteResGroup99 : SchedWriteRes<[SBPort0]> {
+  let Latency = 11;
+  let NumMicroOps = 3;
+  let ResourceCycles = [3];
+}
+def: InstRW<[SBWriteResGroup99], (instregex "PCMPISTRIrr")>;
+def: InstRW<[SBWriteResGroup99], (instregex "PCMPISTRM128rr")>;
+def: InstRW<[SBWriteResGroup99], (instregex "VPCMPISTRIrr")>;
+def: InstRW<[SBWriteResGroup99], (instregex "VPCMPISTRM128rr")>;
+
+def SBWriteResGroup100 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
+  let Latency = 11;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup100], (instregex "VCVTPD2DQYrm")>;
+def: InstRW<[SBWriteResGroup100], (instregex "VCVTPD2PSYrm")>;
+def: InstRW<[SBWriteResGroup100], (instregex "VCVTTPD2DQYrm")>;
+
+def SBWriteResGroup101 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> {
+  let Latency = 11;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup101], (instregex "MPSADBWrmi")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VMPSADBWrmi")>;
+
+def SBWriteResGroup102 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
+  let Latency = 11;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup102], (instregex "HADDPDrm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "HADDPSrm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "HSUBPDrm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "HSUBPSrm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "VHADDPDrm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "VHADDPSrm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "VHSUBPDrm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "VHSUBPSrm")>;
+
+def SBWriteResGroup103 : SchedWriteRes<[SBPort5]> {
+  let Latency = 12;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SBWriteResGroup103], (instregex "AESIMCrr")>;
+def: InstRW<[SBWriteResGroup103], (instregex "VAESIMCrr")>;
+def: InstRW<[SBWriteResGroup103], (instregex "VMULPDYrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "VMULPSYrm")>;
+
+def SBWriteResGroup104 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
+  let Latency = 12;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup104], (instregex "DPPSrri")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VDPPSYrri")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VDPPSrri")>;
+
+def SBWriteResGroup105 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
+  let Latency = 12;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup105], (instregex "VHADDPDrm")>;
+def: InstRW<[SBWriteResGroup105], (instregex "VHADDPSYrm")>;
+def: InstRW<[SBWriteResGroup105], (instregex "VHSUBPDYrm")>;
+def: InstRW<[SBWriteResGroup105], (instregex "VHSUBPSYrm")>;
+
+def SBWriteResGroup106 : SchedWriteRes<[SBPort5,SBPort23,SBPort015]> {
+  let Latency = 13;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup106], (instregex "AESDECLASTrm")>;
+def: InstRW<[SBWriteResGroup106], (instregex "AESDECrm")>;
+def: InstRW<[SBWriteResGroup106], (instregex "AESENCLASTrm")>;
+def: InstRW<[SBWriteResGroup106], (instregex "AESENCrm")>;
+def: InstRW<[SBWriteResGroup106], (instregex "VAESDECLASTrm")>;
+def: InstRW<[SBWriteResGroup106], (instregex "VAESDECrm")>;
+def: InstRW<[SBWriteResGroup106], (instregex "VAESENCLASTrm")>;
+def: InstRW<[SBWriteResGroup106], (instregex "VAESENCrm")>;
+
+def SBWriteResGroup107 : SchedWriteRes<[SBPort0]> {
+  let Latency = 14;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup107], (instregex "DIVPSrr")>;
+def: InstRW<[SBWriteResGroup107], (instregex "DIVSSrr")>;
+def: InstRW<[SBWriteResGroup107], (instregex "SQRTPSr")>;
+def: InstRW<[SBWriteResGroup107], (instregex "SQRTSSr")>;
+def: InstRW<[SBWriteResGroup107], (instregex "VDIVPSrr")>;
+def: InstRW<[SBWriteResGroup107], (instregex "VDIVSSrr")>;
+def: InstRW<[SBWriteResGroup107], (instregex "VSQRTPSr")>;
+
+def SBWriteResGroup108 : SchedWriteRes<[SBPort0,SBPort23]> {
+  let Latency = 14;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup108], (instregex "VSQRTSSm")>;
+
+def SBWriteResGroup109 : SchedWriteRes<[SBPort0,SBPort23,SBPort0]> {
+  let Latency = 14;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SBWriteResGroup109], (instregex "VRCPPSm")>;
+def: InstRW<[SBWriteResGroup109], (instregex "VRSQRTPSYm")>;
+
+def SBWriteResGroup110 : SchedWriteRes<[SBPort0,SBPort1,SBPort5,SBPort23]> {
+  let Latency = 15;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SBWriteResGroup110], (instregex "DPPDrmi")>;
+def: InstRW<[SBWriteResGroup110], (instregex "VDPPDrmi")>;
+
+def SBWriteResGroup111 : SchedWriteRes<[SBPort0,SBPort23]> {
+  let Latency = 17;
+  let NumMicroOps = 4;
+  let ResourceCycles = [3,1];
+}
+def: InstRW<[SBWriteResGroup111], (instregex "PCMPISTRIrm")>;
+def: InstRW<[SBWriteResGroup111], (instregex "PCMPISTRM128rm")>;
+def: InstRW<[SBWriteResGroup111], (instregex "VPCMPISTRIrm")>;
+def: InstRW<[SBWriteResGroup111], (instregex "VPCMPISTRM128rm")>;
+
+def SBWriteResGroup112 : SchedWriteRes<[SBPort5,SBPort23]> {
+  let Latency = 18;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup112], (instregex "AESIMCrm")>;
+def: InstRW<[SBWriteResGroup112], (instregex "VAESIMCrm")>;
+
+def SBWriteResGroup113 : SchedWriteRes<[SBPort0,SBPort23]> {
+  let Latency = 20;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup113], (instregex "DIVPSrm")>;
+def: InstRW<[SBWriteResGroup113], (instregex "DIVSSrm")>;
+def: InstRW<[SBWriteResGroup113], (instregex "SQRTPSm")>;
+def: InstRW<[SBWriteResGroup113], (instregex "SQRTSSm")>;
+def: InstRW<[SBWriteResGroup113], (instregex "VDIVPSrm")>;
+def: InstRW<[SBWriteResGroup113], (instregex "VDIVSSrm")>;
+def: InstRW<[SBWriteResGroup113], (instregex "VSQRTPSm")>;
+
+def SBWriteResGroup114 : SchedWriteRes<[SBPort0]> {
+  let Latency = 21;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup114], (instregex "VSQRTSDr")>;
+
+def SBWriteResGroup115 : SchedWriteRes<[SBPort0,SBPort23]> {
+  let Latency = 21;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup115], (instregex "VSQRTSDm")>;
+
+def SBWriteResGroup116 : SchedWriteRes<[SBPort0]> {
+  let Latency = 22;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup116], (instregex "DIVPDrr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "DIVSDrr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "SQRTPDr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "SQRTSDr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "VDIVPDrr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "VDIVSDrr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "VSQRTPDr")>;
+
+def SBWriteResGroup117 : SchedWriteRes<[SBPort0,SBPort23]> {
+  let Latency = 28;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup117], (instregex "DIVPDrm")>;
+def: InstRW<[SBWriteResGroup117], (instregex "DIVSDrm")>;
+def: InstRW<[SBWriteResGroup117], (instregex "SQRTPDm")>;
+def: InstRW<[SBWriteResGroup117], (instregex "SQRTSDm")>;
+def: InstRW<[SBWriteResGroup117], (instregex "VDIVPDrm")>;
+def: InstRW<[SBWriteResGroup117], (instregex "VDIVSDrm")>;
+def: InstRW<[SBWriteResGroup117], (instregex "VSQRTPDm")>;
+
+def SBWriteResGroup118 : SchedWriteRes<[SBPort0,SBPort0]> {
+  let Latency = 29;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup118], (instregex "VDIVPSYrr")>;
+def: InstRW<[SBWriteResGroup118], (instregex "VSQRTPSYr")>;
+
+def SBWriteResGroup119 : SchedWriteRes<[SBPort0,SBPort23,SBPort0]> {
+  let Latency = 36;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SBWriteResGroup119], (instregex "VDIVPSYrm")>;
+def: InstRW<[SBWriteResGroup119], (instregex "VSQRTPSYm")>;
+
+def SBWriteResGroup120 : SchedWriteRes<[SBPort0,SBPort0]> {
+  let Latency = 45;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup120], (instregex "VDIVPDYrr")>;
+def: InstRW<[SBWriteResGroup120], (instregex "VSQRTPDYr")>;
+
+def SBWriteResGroup121 : SchedWriteRes<[SBPort0,SBPort23,SBPort0]> {
+  let Latency = 52;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SBWriteResGroup121], (instregex "VDIVPDYrm")>;
+def: InstRW<[SBWriteResGroup121], (instregex "VSQRTPDYm")>;
+
+def SBWriteResGroup122 : SchedWriteRes<[SBPort0]> {
+  let Latency = 114;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup122], (instregex "VSQRTSSr")>;
+
 } // SchedModel
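
All of the SandyBridge additions above follow the same two-part pattern: a SchedWriteRes record fixes the three data points listed in the commit message (static latency, number of uOps, and the ports used, via per-port resource cycles), and a set of InstRW records then binds instructions to that record by regular expression. The sketch below is a minimal illustration of that pattern only; "SBWriteResGroupExample" and "FOOBARrm" are placeholders and are not definitions from this patch.

    // Minimal sketch of the pattern used throughout the SandyBridge changes.
    // "SBWriteResGroupExample" and "FOOBARrm" are placeholders, not real
    // definitions from X86SchedSandyBridge.td.
    def SBWriteResGroupExample : SchedWriteRes<[SBPort23,SBPort015]> {
      let Latency = 7;            // static latency of the instruction, in cycles
      let NumMicroOps = 2;        // number of uOps the instruction decodes into
      let ResourceCycles = [1,1]; // cycles consumed on each port listed above
    }
    // Attach the folded-load form of an instruction to the group by regex.
    def: InstRW<[SBWriteResGroupExample], (instregex "FOOBARrm")>;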

Modified: llvm/trunk/test/CodeGen/X86/avx-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-schedule.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-schedule.ll Tue Jun 27 08:05:13 2017
@@ -10,14 +10,14 @@ define <4 x double> @test_addpd(<4 x dou
 ; SANDY-LABEL: test_addpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_addpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addpd:
 ; BTVER2:       # BB#0:
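
The sched comments in these checks record the static latency and reciprocal throughput the model reports for each instruction, so the churn in this file follows directly from the new per-instruction data. For example, the SANDY check for the folded-load vaddpd above becomes [9:1.00] because VADDPDYrm is now mapped to SBWriteResGroup95 in the SandyBridge model. The excerpt below is copied from the additions above; the reading of the throughput figure is an assumption about how the printer derives it, not something stated in this patch.

    // Excerpt from the SandyBridge additions above:
    def SBWriteResGroup95 : SchedWriteRes<[SBPort4,SBPort5,SBPort23,SBPort0,SBPort015]> {
      let Latency = 9;
      let NumMicroOps = 6;
      let ResourceCycles = [1,1,2,1,1];
    }
    // Latency = 9 gives the first figure in "sched: [9:1.00]". The second figure
    // is the reciprocal throughput; assuming it is the largest per-resource ratio
    // of consumed cycles to available units, SBPort23 (two load ports, 2 cycles)
    // and each single-unit port (1 cycle apiece) all come out at 1.00.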
@@ -40,14 +40,14 @@ define <8 x float> @test_addps(<8 x floa
 ; SANDY-LABEL: test_addps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_addps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addps:
 ; BTVER2:       # BB#0:
@@ -70,14 +70,14 @@ define <4 x double> @test_addsubpd(<4 x
 ; SANDY-LABEL: test_addsubpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_addsubpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addsubpd:
 ; BTVER2:       # BB#0:
@@ -101,14 +101,14 @@ define <8 x float> @test_addsubps(<8 x f
 ; SANDY-LABEL: test_addsubps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_addsubps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addsubps:
 ; BTVER2:       # BB#0:
@@ -131,17 +131,17 @@ declare <8 x float> @llvm.x86.avx.addsub
 define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
 ; SANDY-LABEL: test_andnotpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT:    vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT:    vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT:    vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; SANDY-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_andnotpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandnpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andnotpd:
 ; BTVER2:       # BB#0:
@@ -172,17 +172,17 @@ define <4 x double> @test_andnotpd(<4 x
 define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
 ; SANDY-LABEL: test_andnotps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT:    vandnps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT:    vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT:    vandnps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_andnotps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandnps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandnps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andnotps:
 ; BTVER2:       # BB#0:
@@ -216,14 +216,14 @@ define <4 x double> @test_andpd(<4 x dou
 ; SANDY-NEXT:    vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SANDY-NEXT:    vandpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
 ; SANDY-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_andpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT:    vandpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andpd:
 ; BTVER2:       # BB#0:
@@ -255,14 +255,14 @@ define <8 x float> @test_andps(<8 x floa
 ; SANDY-NEXT:    vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SANDY-NEXT:    vandps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_andps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT:    vandps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andps:
 ; BTVER2:       # BB#0:
@@ -291,17 +291,17 @@ define <8 x float> @test_andps(<8 x floa
 define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
 ; SANDY-LABEL: test_blendpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
+; SANDY-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:1.00]
 ; SANDY-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_blendpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendpd:
 ; BTVER2:       # BB#0:
@@ -326,15 +326,15 @@ define <4 x double> @test_blendpd(<4 x d
 define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
 ; SANDY-LABEL: test_blendps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
-; SANDY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:1.00]
+; SANDY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_blendps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33]
-; HASWELL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendps:
 ; BTVER2:       # BB#0:
@@ -356,15 +356,15 @@ define <8 x float> @test_blendps(<8 x fl
 define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
 ; SANDY-LABEL: test_blendvpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; SANDY-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; SANDY-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_blendvpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; HASWELL-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendvpd:
 ; BTVER2:       # BB#0:
@@ -387,15 +387,15 @@ declare <4 x double> @llvm.x86.avx.blend
 define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
 ; SANDY-LABEL: test_blendvps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; SANDY-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; SANDY-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_blendvps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; HASWELL-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_blendvps:
 ; BTVER2:       # BB#0:
@@ -418,13 +418,13 @@ declare <8 x float> @llvm.x86.avx.blendv
 define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
 ; SANDY-LABEL: test_broadcastf128:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [3:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_broadcastf128:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [?:5.000000e-01]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastf128:
 ; BTVER2:       # BB#0:
@@ -443,13 +443,13 @@ define <8 x float> @test_broadcastf128(<
 define <4 x double> @test_broadcastsd_ymm(double *%a0) {
 ; SANDY-LABEL: test_broadcastsd_ymm:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_broadcastsd_ymm:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [?:5.000000e-01]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastsd_ymm:
 ; BTVER2:       # BB#0:
@@ -469,13 +469,13 @@ define <4 x double> @test_broadcastsd_ym
 define <4 x float> @test_broadcastss(float *%a0) {
 ; SANDY-LABEL: test_broadcastss:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_broadcastss:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [?:5.000000e-01]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastss:
 ; BTVER2:       # BB#0:
@@ -496,12 +496,12 @@ define <8 x float> @test_broadcastss_ymm
 ; SANDY-LABEL: test_broadcastss_ymm:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_broadcastss_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastss_ymm:
 ; BTVER2:       # BB#0:
@@ -521,17 +521,17 @@ define <8 x float> @test_broadcastss_ymm
 define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
 ; SANDY-LABEL: test_cmppd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
+; SANDY-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT:    vorpd %ymm2, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cmppd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmppd:
 ; BTVER2:       # BB#0:
@@ -559,17 +559,17 @@ define <4 x double> @test_cmppd(<4 x dou
 define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
 ; SANDY-LABEL: test_cmpps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vcmpeqps (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
+; SANDY-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT:    vorps %ymm2, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cmpps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT:    vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT:    vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmpps:
 ; BTVER2:       # BB#0:
@@ -598,16 +598,16 @@ define <4 x double> @test_cvtdq2pd(<4 x
 ; SANDY-LABEL: test_cvtdq2pd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
-; SANDY-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; SANDY-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [10:1.00]
 ; SANDY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtdq2pd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtdq2pd %xmm0, %ymm0 # sched: [6:1.00]
-; HASWELL-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [6:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2pd:
 ; BTVER2:       # BB#0:
@@ -632,19 +632,19 @@ define <4 x double> @test_cvtdq2pd(<4 x
 define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
 ; SANDY-LABEL: test_cvtdq2ps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
-; SANDY-NEXT:    vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT:    vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vcvtdq2ps %ymm1, %ymm1 # sched: [4:1.00]
+; SANDY-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT:    vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT:    vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [7:1.00]
+; SANDY-NEXT:    vcvtdq2ps %ymm1, %ymm1 # sched: [3:1.00]
 ; SANDY-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtdq2ps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [3:1.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2ps:
 ; BTVER2:       # BB#0:
@@ -669,17 +669,17 @@ define <8 x float> @test_cvtdq2ps(<8 x i
 define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
 ; SANDY-LABEL: test_cvtpd2dq:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT:    vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
 ; SANDY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtpd2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00]
-; HASWELL-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [10:1.00]
+; HASWELL-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2dq:
 ; BTVER2:       # BB#0:
@@ -704,17 +704,17 @@ define <8 x i32> @test_cvtpd2dq(<4 x dou
 define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) {
 ; SANDY-LABEL: test_cvtpd2ps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [11:1.00]
 ; SANDY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtpd2ps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2ps:
 ; BTVER2:       # BB#0:
@@ -741,15 +741,15 @@ define <8 x i32> @test_cvtps2dq(<8 x flo
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
-; SANDY-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtps2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtps2dq:
 ; BTVER2:       # BB#0:
@@ -774,15 +774,15 @@ define <8 x i32> @test_cvtps2dq(<8 x flo
 define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
 ; SANDY-LABEL: test_divpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [45:3.00]
+; SANDY-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [52:3.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_divpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [27:2.00]
-; HASWELL-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [31:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [35:2.00]
+; HASWELL-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [35:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divpd:
 ; BTVER2:       # BB#0:
@@ -804,15 +804,15 @@ define <4 x double> @test_divpd(<4 x dou
 define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
 ; SANDY-LABEL: test_divps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [29:3.00]
+; SANDY-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [36:3.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_divps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [19:2.00]
-; HASWELL-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [23:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divps:
 ; BTVER2:       # BB#0:
@@ -834,15 +834,15 @@ define <8 x float> @test_divps(<8 x floa
 define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
 ; SANDY-LABEL: test_dpps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:2.00]
 ; SANDY-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_dpps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [14:2.00]
-; HASWELL-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [18:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [14:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_dpps:
 ; BTVER2:       # BB#0:
@@ -866,16 +866,16 @@ define <4 x float> @test_extractf128(<8
 ; SANDY-LABEL: test_extractf128:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_extractf128:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [4:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_extractf128:
 ; BTVER2:       # BB#0:
@@ -900,13 +900,13 @@ define <4 x double> @test_haddpd(<4 x do
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_haddpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
 ; HASWELL-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_haddpd:
 ; BTVER2:       # BB#0:
@@ -929,15 +929,15 @@ declare <4 x double> @llvm.x86.avx.hadd.
 define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
 ; SANDY-LABEL: test_haddps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_haddps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_haddps:
 ; BTVER2:       # BB#0:
@@ -960,15 +960,15 @@ declare <8 x float> @llvm.x86.avx.hadd.p
 define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
 ; SANDY-LABEL: test_hsubpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_hsubpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_hsubpd:
 ; BTVER2:       # BB#0:
@@ -991,15 +991,15 @@ declare <4 x double> @llvm.x86.avx.hsub.
 define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
 ; SANDY-LABEL: test_hsubps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_hsubps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_hsubps:
 ; BTVER2:       # BB#0:
@@ -1023,16 +1023,16 @@ define <8 x float> @test_insertf128(<8 x
 ; SANDY-LABEL: test_insertf128:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
-; SANDY-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_insertf128:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_insertf128:
 ; BTVER2:       # BB#0:
@@ -1059,13 +1059,13 @@ define <8 x float> @test_insertf128(<8 x
 define <32 x i8> @test_lddqu(i8* %a0) {
 ; SANDY-LABEL: test_lddqu:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vlddqu (%rdi), %ymm0 # sched: [4:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vlddqu (%rdi), %ymm0 # sched: [6:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_lddqu:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vlddqu (%rdi), %ymm0 # sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vlddqu (%rdi), %ymm0 # sched: [?:5.000000e-01]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lddqu:
 ; BTVER2:       # BB#0:
@@ -1084,17 +1084,17 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.2
 define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) {
 ; SANDY-LABEL: test_maskmovpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; SANDY-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_maskmovpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
-; HASWELL-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [2:2.00]
+; HASWELL-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [4:1.00]
 ; HASWELL-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovpd:
 ; BTVER2:       # BB#0:
@@ -1119,29 +1119,29 @@ declare void @llvm.x86.avx.maskstore.pd(
 define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2) {
 ; SANDY-LABEL: test_maskmovpd_ymm:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [5:1.00]
+; SANDY-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi)
 ; SANDY-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_maskmovpd_ymm:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
-; HASWELL-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [4:1.00]
+; HASWELL-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi)
 ; HASWELL-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovpd_ymm:
 ; BTVER2:       # BB#0:
 ; BTVER2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; BTVER2-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi)
 ; BTVER2-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_maskmovpd_ymm:
 ; ZNVER1:       # BB#0:
 ; ZNVER1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi)
 ; ZNVER1-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.50]
 ; ZNVER1-NEXT:    retq # sched: [4:1.00]
   %1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1)
@@ -1154,17 +1154,17 @@ declare void @llvm.x86.avx.maskstore.pd.
 define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) {
 ; SANDY-LABEL: test_maskmovps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; SANDY-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
 ; SANDY-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_maskmovps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
-; HASWELL-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [2:2.00]
+; HASWELL-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [4:1.00]
 ; HASWELL-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovps:
 ; BTVER2:       # BB#0:
@@ -1189,29 +1189,29 @@ declare void @llvm.x86.avx.maskstore.ps(
 define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2) {
 ; SANDY-LABEL: test_maskmovps_ymm:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [1:0.50]
+; SANDY-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
 ; SANDY-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_maskmovps_ymm:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
-; HASWELL-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [1:0.50]
+; HASWELL-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
 ; HASWELL-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovps_ymm:
 ; BTVER2:       # BB#0:
 ; BTVER2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; BTVER2-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
 ; BTVER2-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_maskmovps_ymm:
 ; ZNVER1:       # BB#0:
 ; ZNVER1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
 ; ZNVER1-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.50]
 ; ZNVER1-NEXT:    retq # sched: [4:1.00]
   %1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1)
@@ -1225,14 +1225,14 @@ define <4 x double> @test_maxpd(<4 x dou
 ; SANDY-LABEL: test_maxpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_maxpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxpd:
 ; BTVER2:       # BB#0:
@@ -1256,14 +1256,14 @@ define <8 x float> @test_maxps(<8 x floa
 ; SANDY-LABEL: test_maxps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_maxps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxps:
 ; BTVER2:       # BB#0:
@@ -1288,13 +1288,13 @@ define <4 x double> @test_minpd(<4 x dou
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_minpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minpd:
 ; BTVER2:       # BB#0:
@@ -1319,13 +1319,13 @@ define <8 x float> @test_minps(<8 x floa
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_minps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minps:
 ; BTVER2:       # BB#0:
@@ -1348,17 +1348,17 @@ declare <8 x float> @llvm.x86.avx.min.ps
 define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
 ; SANDY-LABEL: test_movapd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY-NEXT:    vmovapd (%rdi), %ymm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovapd %ymm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovapd %ymm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movapd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovapd (%rdi), %ymm0 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovapd %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovapd %ymm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movapd:
 ; BTVER2:       # BB#0:
@@ -1382,17 +1382,17 @@ define <4 x double> @test_movapd(<4 x do
 define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
 ; SANDY-LABEL: test_movaps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY-NEXT:    vmovaps (%rdi), %ymm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovaps %ymm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovaps %ymm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movaps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovaps (%rdi), %ymm0 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovaps %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovaps %ymm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movaps:
 ; BTVER2:       # BB#0:
@@ -1417,16 +1417,16 @@ define <4 x double> @test_movddup(<4 x d
 ; SANDY-LABEL: test_movddup:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
-; SANDY-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; SANDY-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50]
 ; SANDY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movddup:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
-; HASWELL-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; HASWELL-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movddup:
 ; BTVER2:       # BB#0:
@@ -1451,15 +1451,15 @@ define <4 x double> @test_movddup(<4 x d
 define i32 @test_movmskpd(<4 x double> %a0) {
 ; SANDY-LABEL: test_movmskpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovmskpd %ymm0, %eax # sched: [1:0.33]
+; SANDY-NEXT:    vmovmskpd %ymm0, %eax # sched: [2:1.00]
 ; SANDY-NEXT:    vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movmskpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovmskpd %ymm0, %eax # sched: [2:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovmskpd %ymm0, %eax # sched: [3:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movmskpd:
 ; BTVER2:       # BB#0:
@@ -1479,15 +1479,15 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(
 define i32 @test_movmskps(<8 x float> %a0) {
 ; SANDY-LABEL: test_movmskps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovmskps %ymm0, %eax # sched: [1:0.33]
+; SANDY-NEXT:    vmovmskps %ymm0, %eax # sched: [3:1.00]
 ; SANDY-NEXT:    vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movmskps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovmskps %ymm0, %eax # sched: [2:1.00]
-; HASWELL-NEXT:    vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovmskps %ymm0, %eax # sched: [5:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movmskps:
 ; BTVER2:       # BB#0:
@@ -1508,14 +1508,14 @@ define <4 x double> @test_movntpd(<4 x d
 ; SANDY-LABEL: test_movntpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movntpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movntpd:
 ; BTVER2:       # BB#0:
@@ -1537,14 +1537,14 @@ define <8 x float> @test_movntps(<8 x fl
 ; SANDY-LABEL: test_movntps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovntps %ymm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovntps %ymm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movntps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovntps %ymm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovntps %ymm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movntps:
 ; BTVER2:       # BB#0:
@@ -1566,16 +1566,16 @@ define <8 x float> @test_movshdup(<8 x f
 ; SANDY-LABEL: test_movshdup:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
-; SANDY-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; SANDY-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50]
 ; SANDY-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movshdup:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
-; HASWELL-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; HASWELL-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movshdup:
 ; BTVER2:       # BB#0:
@@ -1601,16 +1601,16 @@ define <8 x float> @test_movsldup(<8 x f
 ; SANDY-LABEL: test_movsldup:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
-; SANDY-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; SANDY-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50]
 ; SANDY-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movsldup:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
-; HASWELL-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; HASWELL-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movsldup:
 ; BTVER2:       # BB#0:
@@ -1635,19 +1635,19 @@ define <8 x float> @test_movsldup(<8 x f
 define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
 ; SANDY-LABEL: test_movupd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovups (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movupd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovupd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovupd (%rdi), %ymm0 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovupd %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovupd %ymm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movupd:
 ; BTVER2:       # BB#0:
@@ -1671,19 +1671,19 @@ define <4 x double> @test_movupd(<4 x do
 define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
 ; SANDY-LABEL: test_movups:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovups (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movups:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovups (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovups (%rdi), %ymm0 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovups %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovups %ymm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movups:
 ; BTVER2:       # BB#0:
@@ -1708,14 +1708,14 @@ define <4 x double> @test_mulpd(<4 x dou
 ; SANDY-LABEL: test_mulpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_mulpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulpd:
 ; BTVER2:       # BB#0:
@@ -1738,14 +1738,14 @@ define <8 x float> @test_mulps(<8 x floa
 ; SANDY-LABEL: test_mulps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_mulps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulps:
 ; BTVER2:       # BB#0:
@@ -1767,17 +1767,17 @@ define <8 x float> @test_mulps(<8 x floa
 define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
 ; SANDY-LABEL: orpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT:    vorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT:    vorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; SANDY-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: orpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vorpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: orpd:
 ; BTVER2:       # BB#0:
@@ -1806,17 +1806,17 @@ define <4 x double> @orpd(<4 x double> %
 define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
 ; SANDY-LABEL: test_orps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT:    vorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT:    vorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_orps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vorps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_orps:
 ; BTVER2:       # BB#0:
@@ -1846,16 +1846,16 @@ define <2 x double> @test_permilpd(<2 x
 ; SANDY-LABEL: test_permilpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; SANDY-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; SANDY-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_permilpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; HASWELL-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; HASWELL-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilpd:
 ; BTVER2:       # BB#0:
@@ -1880,17 +1880,17 @@ define <2 x double> @test_permilpd(<2 x
 define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
 ; SANDY-LABEL: test_permilpd_ymm:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; SANDY-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [8:1.00]
 ; SANDY-NEXT:    vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
 ; SANDY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_permilpd_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
 ; HASWELL-NEXT:    vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilpd_ymm:
 ; BTVER2:       # BB#0:
@@ -1916,16 +1916,16 @@ define <4 x float> @test_permilps(<4 x f
 ; SANDY-LABEL: test_permilps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; SANDY-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; SANDY-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_permilps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; HASWELL-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; HASWELL-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilps:
 ; BTVER2:       # BB#0:
@@ -1950,17 +1950,17 @@ define <4 x float> @test_permilps(<4 x f
 define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
 ; SANDY-LABEL: test_permilps_ymm:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; SANDY-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [8:1.00]
 ; SANDY-NEXT:    vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_permilps_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
 ; HASWELL-NEXT:    vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilps_ymm:
 ; BTVER2:       # BB#0:
@@ -1986,14 +1986,14 @@ define <2 x double> @test_permilvarpd(<2
 ; SANDY-LABEL: test_permilvarpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_permilvarpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarpd:
 ; BTVER2:       # BB#0:
@@ -2018,13 +2018,13 @@ define <4 x double> @test_permilvarpd_ym
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
 ; SANDY-NEXT:    vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_permilvarpd_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarpd_ymm:
 ; BTVER2:       # BB#0:
@@ -2048,14 +2048,14 @@ define <4 x float> @test_permilvarps(<4
 ; SANDY-LABEL: test_permilvarps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_permilvarps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarps:
 ; BTVER2:       # BB#0:
@@ -2080,13 +2080,13 @@ define <8 x float> @test_permilvarps_ymm
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
 ; SANDY-NEXT:    vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_permilvarps_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarps_ymm:
 ; BTVER2:       # BB#0:
@@ -2112,14 +2112,14 @@ define <8 x float> @test_rcpps(<8 x floa
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vrcpps (%rdi), %ymm1 # sched: [9:1.00]
 ; SANDY-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_rcpps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps (%rdi), %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rcpps:
 ; BTVER2:       # BB#0:
@@ -2148,14 +2148,14 @@ define <4 x double> @test_roundpd(<4 x d
 ; SANDY-NEXT:    vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vroundpd $7, (%rdi), %ymm1 # sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_roundpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vroundpd $7, %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT:    vroundpd $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL-NEXT:    vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vroundpd $7, (%rdi), %ymm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_roundpd:
 ; BTVER2:       # BB#0:
@@ -2184,14 +2184,14 @@ define <8 x float> @test_roundps(<8 x fl
 ; SANDY-NEXT:    vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vroundps $7, (%rdi), %ymm1 # sched: [7:1.00]
 ; SANDY-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_roundps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vroundps $7, %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT:    vroundps $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL-NEXT:    vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vroundps $7, (%rdi), %ymm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_roundps:
 ; BTVER2:       # BB#0:
@@ -2217,17 +2217,17 @@ declare <8 x float> @llvm.x86.avx.round.
 define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
 ; SANDY-LABEL: test_rsqrtps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrsqrtps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vrsqrtps (%rdi), %ymm1 # sched: [9:1.00]
+; SANDY-NEXT:    vrsqrtps (%rdi), %ymm1 # sched: [14:3.00]
+; SANDY-NEXT:    vrsqrtps %ymm0, %ymm0 # sched: [7:3.00]
 ; SANDY-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_rsqrtps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrsqrtps (%rdi), %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vrsqrtps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL-NEXT:    vrsqrtps %ymm0, %ymm0 # sched: [11:2.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rsqrtps:
 ; BTVER2:       # BB#0:
@@ -2254,16 +2254,16 @@ define <4 x double> @test_shufpd(<4 x do
 ; SANDY-LABEL: test_shufpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; SANDY-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; SANDY-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
 ; SANDY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_shufpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; HASWELL-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; HASWELL-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_shufpd:
 ; BTVER2:       # BB#0:
@@ -2289,14 +2289,14 @@ define <8 x float> @test_shufps(<8 x flo
 ; SANDY-LABEL: test_shufps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; SANDY-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_shufps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; HASWELL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_shufps:
 ; BTVER2:       # BB#0:
@@ -2318,17 +2318,17 @@ define <8 x float> @test_shufps(<8 x flo
 define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
 ; SANDY-LABEL: test_sqrtpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vsqrtpd %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [52:3.00]
+; SANDY-NEXT:    vsqrtpd %ymm0, %ymm0 # sched: [45:3.00]
 ; SANDY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_sqrtpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [32:2.00]
-; HASWELL-NEXT:    vsqrtpd %ymm0, %ymm0 # sched: [28:2.00]
+; HASWELL-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [35:2.00]
+; HASWELL-NEXT:    vsqrtpd %ymm0, %ymm0 # sched: [35:2.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtpd:
 ; BTVER2:       # BB#0:
@@ -2354,17 +2354,17 @@ declare <4 x double> @llvm.x86.avx.sqrt.
 define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
 ; SANDY-LABEL: test_sqrtps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vsqrtps %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [36:3.00]
+; SANDY-NEXT:    vsqrtps %ymm0, %ymm0 # sched: [29:3.00]
 ; SANDY-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_sqrtps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [23:2.00]
-; HASWELL-NEXT:    vsqrtps %ymm0, %ymm0 # sched: [19:2.00]
+; HASWELL-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [21:2.00]
+; HASWELL-NEXT:    vsqrtps %ymm0, %ymm0 # sched: [21:2.00]
 ; HASWELL-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtps:
 ; BTVER2:       # BB#0:
@@ -2391,14 +2391,14 @@ define <4 x double> @test_subpd(<4 x dou
 ; SANDY-LABEL: test_subpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_subpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subpd:
 ; BTVER2:       # BB#0:
@@ -2421,14 +2421,14 @@ define <8 x float> @test_subps(<8 x floa
 ; SANDY-LABEL: test_subps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_subps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subps:
 ; BTVER2:       # BB#0:
@@ -2451,20 +2451,20 @@ define i32 @test_testpd(<2 x double> %a0
 ; SANDY-LABEL: test_testpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT:    vtestpd %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    setb %al # sched: [1:0.33]
-; SANDY-NEXT:    vtestpd (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    setb %al # sched: [1:1.00]
+; SANDY-NEXT:    vtestpd (%rdi), %xmm0 # sched: [7:1.00]
 ; SANDY-NEXT:    adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_testpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    vtestpd %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    setb %al # sched: [1:0.50]
-; HASWELL-NEXT:    vtestpd (%rdi), %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    setb %al # sched: [1:1.00]
+; HASWELL-NEXT:    vtestpd (%rdi), %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    adcl $0, %eax # sched: [1:0.25]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_testpd:
 ; BTVER2:       # BB#0:
@@ -2495,22 +2495,22 @@ define i32 @test_testpd_ymm(<4 x double>
 ; SANDY-LABEL: test_testpd_ymm:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT:    vtestpd %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT:    setb %al # sched: [1:0.33]
-; SANDY-NEXT:    vtestpd (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT:    vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT:    setb %al # sched: [1:1.00]
+; SANDY-NEXT:    vtestpd (%rdi), %ymm0 # sched: [8:1.00]
 ; SANDY-NEXT:    adcl $0, %eax # sched: [1:0.33]
 ; SANDY-NEXT:    vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_testpd_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    vtestpd %ymm1, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    setb %al # sched: [1:0.50]
-; HASWELL-NEXT:    vtestpd (%rdi), %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT:    vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT:    setb %al # sched: [1:1.00]
+; HASWELL-NEXT:    vtestpd (%rdi), %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT:    adcl $0, %eax # sched: [1:0.25]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_testpd_ymm:
 ; BTVER2:       # BB#0:
@@ -2542,20 +2542,20 @@ define i32 @test_testps(<4 x float> %a0,
 ; SANDY-LABEL: test_testps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT:    vtestps %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    setb %al # sched: [1:0.33]
-; SANDY-NEXT:    vtestps (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    setb %al # sched: [1:1.00]
+; SANDY-NEXT:    vtestps (%rdi), %xmm0 # sched: [7:1.00]
 ; SANDY-NEXT:    adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_testps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    vtestps %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    setb %al # sched: [1:0.50]
-; HASWELL-NEXT:    vtestps (%rdi), %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    setb %al # sched: [1:1.00]
+; HASWELL-NEXT:    vtestps (%rdi), %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    adcl $0, %eax # sched: [1:0.25]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_testps:
 ; BTVER2:       # BB#0:
@@ -2586,22 +2586,22 @@ define i32 @test_testps_ymm(<8 x float>
 ; SANDY-LABEL: test_testps_ymm:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT:    vtestps %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT:    setb %al # sched: [1:0.33]
-; SANDY-NEXT:    vtestps (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT:    vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT:    setb %al # sched: [1:1.00]
+; SANDY-NEXT:    vtestps (%rdi), %ymm0 # sched: [8:1.00]
 ; SANDY-NEXT:    adcl $0, %eax # sched: [1:0.33]
 ; SANDY-NEXT:    vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_testps_ymm:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    vtestps %ymm1, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    setb %al # sched: [1:0.50]
-; HASWELL-NEXT:    vtestps (%rdi), %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT:    vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT:    setb %al # sched: [1:1.00]
+; HASWELL-NEXT:    vtestps (%rdi), %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT:    adcl $0, %eax # sched: [1:0.25]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_testps_ymm:
 ; BTVER2:       # BB#0:
@@ -2635,14 +2635,14 @@ define <4 x double> @test_unpckhpd(<4 x
 ; SANDY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; SANDY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
 ; SANDY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_unpckhpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; HASWELL-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhpd:
 ; BTVER2:       # BB#0:
@@ -2669,13 +2669,13 @@ define <8 x float> @test_unpckhps(<8 x f
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; SANDY-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_unpckhps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; HASWELL-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhps:
 ; BTVER2:       # BB#0:
@@ -2698,16 +2698,16 @@ define <4 x double> @test_unpcklpd(<4 x
 ; SANDY-LABEL: test_unpcklpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SANDY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; SANDY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
 ; SANDY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_unpcklpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; HASWELL-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; HASWELL-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklpd:
 ; BTVER2:       # BB#0:
@@ -2733,14 +2733,14 @@ define <8 x float> @test_unpcklps(<8 x f
 ; SANDY-LABEL: test_unpcklps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SANDY-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_unpcklps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; HASWELL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklps:
 ; BTVER2:       # BB#0:
@@ -2765,14 +2765,14 @@ define <4 x double> @test_xorpd(<4 x dou
 ; SANDY-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SANDY-NEXT:    vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
 ; SANDY-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_xorpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT:    vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_xorpd:
 ; BTVER2:       # BB#0:
@@ -2804,14 +2804,14 @@ define <8 x float> @test_xorps(<8 x floa
 ; SANDY-NEXT:    vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SANDY-NEXT:    vxorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_xorps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT:    vxorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_xorps:
 ; BTVER2:       # BB#0:
@@ -2841,12 +2841,12 @@ define void @test_zeroall() {
 ; SANDY-LABEL: test_zeroall:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vzeroall # sched: [?:0.000000e+00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_zeroall:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vzeroall # sched: [1:0.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vzeroall # sched: [16:16.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_zeroall:
 ; BTVER2:       # BB#0:
@@ -2866,12 +2866,12 @@ define void @test_zeroupper() {
 ; SANDY-LABEL: test_zeroupper:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_zeroupper:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_zeroupper:
 ; BTVER2:       # BB#0:

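The "sched: [A:B]" comments checked in the hunks above are the per-instruction scheduling data printed into the generated assembly: A is the static latency in cycles and B is the reciprocal throughput. In the TableGen scheduling models this data comes from write-resource entries that list the ports an instruction occupies. A minimal illustrative sketch is below; the def name and the numbers are invented and are not taken from X86SchedHaswell.td, but the fields are the ones a SchedWriteRes definition provides.

    // Hypothetical entry, for illustration only. HWPort0 is a Haswell port
    // resource; the def name and the values here are made up.
    def HWWriteExampleYmm : SchedWriteRes<[HWPort0]> {
      let Latency = 5;          // printed as the "5" in "sched: [5:1.00]"
      let NumMicroOps = 1;      // number of uops the instruction decodes into
      let ResourceCycles = [1]; // one cycle on a single port => 1.00 throughput
    }

Instructions that can issue on several ports report a fractional reciprocal throughput (the 0.50, 0.33 and 0.25 values in the checks above), and the Sandy Bridge checks report extra latency for folded-load forms, for example [9:1.00] for vsubpd from memory versus [3:1.00] for the register form.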
Modified: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll Tue Jun 27 08:05:13 2017
@@ -1619,10 +1619,10 @@ define <8 x float>  @test_gather_mask(<8
 ;
 ; AVX512VL-LABEL: test_gather_mask:
 ; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT:    vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
-; AVX512VL-NEXT:    vgatherdps %ymm3, (%eax,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x88]
 ; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; AVX512VL-NEXT:    vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
+; AVX512VL-NEXT:    vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x89]
 ; AVX512VL-NEXT:    vmovups %ymm2, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x10]
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   %a_i8 = bitcast float* %a to i8*

Modified: llvm/trunk/test/CodeGen/X86/avx2-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-schedule.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-schedule.ll Tue Jun 27 08:05:13 2017
@@ -9,7 +9,7 @@ define <32 x i8> @test_pabsb(<32 x i8> %
 ; HASWELL-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpabsb (%rdi), %ymm1 # sched: [5:0.50]
 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pabsb:
 ; ZNVER1:       # BB#0:
@@ -29,9 +29,9 @@ define <8 x i32> @test_pabsd(<8 x i32> %
 ; HASWELL-LABEL: test_pabsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpabsd (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpabsd (%rdi), %ymm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pabsd:
 ; ZNVER1:       # BB#0:
@@ -51,9 +51,9 @@ define <16 x i16> @test_pabsw(<16 x i16>
 ; HASWELL-LABEL: test_pabsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpabsw (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpabsw (%rdi), %ymm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pabsw:
 ; ZNVER1:       # BB#0:
@@ -74,7 +74,7 @@ define <32 x i8> @test_paddb(<32 x i8> %
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_paddb:
 ; ZNVER1:       # BB#0:
@@ -92,7 +92,7 @@ define <8 x i32> @test_paddd(<8 x i32> %
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_paddd:
 ; ZNVER1:       # BB#0:
@@ -109,8 +109,8 @@ define <4 x i64> @test_paddq(<4 x i64> %
 ; HASWELL-LABEL: test_paddq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_paddq:
 ; ZNVER1:       # BB#0:
@@ -128,7 +128,7 @@ define <16 x i16> @test_paddw(<16 x i16>
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_paddw:
 ; ZNVER1:       # BB#0:
@@ -145,9 +145,9 @@ define <4 x i64> @test_pand(<4 x i64> %a
 ; HASWELL-LABEL: test_pand:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pand:
 ; ZNVER1:       # BB#0:
@@ -166,9 +166,9 @@ define <4 x i64> @test_pandn(<4 x i64> %
 ; HASWELL-LABEL: test_pandn:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pandn:
 ; ZNVER1:       # BB#0:
@@ -190,7 +190,7 @@ define <8 x i32> @test_pmulld(<8 x i32>
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00]
 ; HASWELL-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pmulld:
 ; ZNVER1:       # BB#0:
@@ -207,8 +207,8 @@ define <16 x i16> @test_pmullw(<16 x i16
 ; HASWELL-LABEL: test_pmullw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pmullw:
 ; ZNVER1:       # BB#0:
@@ -225,9 +225,9 @@ define <4 x i64> @test_por(<4 x i64> %a0
 ; HASWELL-LABEL: test_por:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_por:
 ; ZNVER1:       # BB#0:
@@ -246,8 +246,8 @@ define <32 x i8> @test_psubb(<32 x i8> %
 ; HASWELL-LABEL: test_psubb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_psubb:
 ; ZNVER1:       # BB#0:
@@ -264,8 +264,8 @@ define <8 x i32> @test_psubd(<8 x i32> %
 ; HASWELL-LABEL: test_psubd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_psubd:
 ; ZNVER1:       # BB#0:
@@ -282,8 +282,8 @@ define <4 x i64> @test_psubq(<4 x i64> %
 ; HASWELL-LABEL: test_psubq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_psubq:
 ; ZNVER1:       # BB#0:
@@ -300,8 +300,8 @@ define <16 x i16> @test_psubw(<16 x i16>
 ; HASWELL-LABEL: test_psubw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_psubw:
 ; ZNVER1:       # BB#0:
@@ -318,9 +318,9 @@ define <4 x i64> @test_pxor(<4 x i64> %a
 ; HASWELL-LABEL: test_pxor:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; ZNVER1-LABEL: test_pxor:
 ; ZNVER1:       # BB#0:

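The avx2-schedule.ll checks above track register and folded-load forms separately. In a TableGen model the two forms are typically bound to different writes, with the memory form also occupying the load ports. The sketch below is hypothetical: the def names, the instregex patterns, and the numbers are invented for illustration, and the in-tree Haswell file may express the same idea through its own helper multiclasses.

    // Hypothetical example of binding reg and mem forms to separate writes;
    // port choice and latencies are arbitrary.
    def HWWriteVPAddYrr : SchedWriteRes<[HWPort0]> {
      let Latency = 1;
      let NumMicroOps = 1;
    }
    def HWWriteVPAddYrm : SchedWriteRes<[HWPort0, HWPort23]> {
      let Latency = 6;             // ALU latency plus an assumed load latency
      let NumMicroOps = 2;         // one extra uop for the folded load
      let ResourceCycles = [1, 1];
    }
    // The opcode regular expressions are illustrative; real names may differ.
    def : InstRW<[HWWriteVPAddYrr], (instregex "VPADD(B|D|Q|W)Yrr")>;
    def : InstRW<[HWWriteVPAddYrm], (instregex "VPADD(B|D|Q|W)Yrm")>;

Whether the reported latency of a folded-load form includes the load is a modeling choice; the Haswell checks above lower several memory forms (for example vpaddq from [5:0.50] to [1:0.50]) while the Sandy Bridge checks keep the larger combined values.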
Modified: llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll Tue Jun 27 08:05:13 2017
@@ -381,6 +381,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4
 ; X32-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
+; X32-NEXT:    ## -- End function
 ;
 ; X64-LABEL: srl_trunc_and_v4i64:
 ; X64:       ## BB#0:
@@ -391,6 +392,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4
 ; X64-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
+; X64-NEXT:    ## -- End function
   %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
   %trunc = trunc <4 x i64> %and to <4 x i32>
   %sra = lshr <4 x i32> %x, %trunc
@@ -412,6 +414,7 @@ define <8 x i16> @shl_8i16(<8 x i16> %r,
 ; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
+; X32-NEXT:    ## -- End function
 ;
 ; X64-LABEL: shl_8i16:
 ; X64:       ## BB#0:
@@ -423,6 +426,7 @@ define <8 x i16> @shl_8i16(<8 x i16> %r,
 ; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
+; X64-NEXT:    ## -- End function
   %shl = shl <8 x i16> %r, %a
   ret <8 x i16> %shl
 }
@@ -434,13 +438,14 @@ define <16 x i16> @shl_16i16(<16 x i16>
 ; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
 ; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
 ; X32-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
-; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
 ; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
 ; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
 ; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpsrld $16, %ymm3, %ymm1
 ; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
-; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X32-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
 ; X32-NEXT:    retl
+; X32-NEXT:    ## -- End function
 ;
 ; X64-LABEL: shl_16i16:
 ; X64:       ## BB#0:
@@ -448,13 +453,14 @@ define <16 x i16> @shl_16i16(<16 x i16>
 ; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
 ; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
 ; X64-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
-; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
 ; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
 ; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
 ; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpsrld $16, %ymm3, %ymm1
 ; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
-; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X64-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    retq
+; X64-NEXT:    ## -- End function
   %shl = shl <16 x i16> %r, %a
   ret <16 x i16> %shl
 }
@@ -474,6 +480,7 @@ define <32 x i8> @shl_32i8(<32 x i8> %r,
 ; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X32-NEXT:    retl
+; X32-NEXT:    ## -- End function
 ;
 ; X64-LABEL: shl_32i8:
 ; X64:       ## BB#0:
@@ -489,6 +496,7 @@ define <32 x i8> @shl_32i8(<32 x i8> %r,
 ; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X64-NEXT:    retq
+; X64-NEXT:    ## -- End function
   %shl = shl <32 x i8> %r, %a
   ret <32 x i8> %shl
 }
@@ -504,6 +512,7 @@ define <8 x i16> @ashr_8i16(<8 x i16> %r
 ; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
+; X32-NEXT:    ## -- End function
 ;
 ; X64-LABEL: ashr_8i16:
 ; X64:       ## BB#0:
@@ -515,6 +524,7 @@ define <8 x i16> @ashr_8i16(<8 x i16> %r
 ; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
+; X64-NEXT:    ## -- End function
   %ashr = ashr <8 x i16> %r, %a
   ret <8 x i16> %ashr
 }
@@ -526,13 +536,14 @@ define <16 x i16> @ashr_16i16(<16 x i16>
 ; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
 ; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
 ; X32-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
-; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
 ; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
 ; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
 ; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpsrld $16, %ymm3, %ymm1
 ; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
-; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X32-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
 ; X32-NEXT:    retl
+; X32-NEXT:    ## -- End function
 ;
 ; X64-LABEL: ashr_16i16:
 ; X64:       ## BB#0:
@@ -540,13 +551,14 @@ define <16 x i16> @ashr_16i16(<16 x i16>
 ; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
 ; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
 ; X64-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
-; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
 ; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
 ; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
 ; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpsrld $16, %ymm3, %ymm1
 ; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
-; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X64-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    retq
+; X64-NEXT:    ## -- End function
   %ashr = ashr <16 x i16> %r, %a
   ret <16 x i16> %ashr
 }
@@ -579,6 +591,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r
 ; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; X32-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 ; X32-NEXT:    retl
+; X32-NEXT:    ## -- End function
 ;
 ; X64-LABEL: ashr_32i8:
 ; X64:       ## BB#0:
@@ -607,6 +620,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r
 ; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; X64-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 ; X64-NEXT:    retq
+; X64-NEXT:    ## -- End function
   %ashr = ashr <32 x i8> %r, %a
   ret <32 x i8> %ashr
 }
@@ -622,6 +636,7 @@ define <8 x i16> @lshr_8i16(<8 x i16> %r
 ; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
+; X32-NEXT:    ## -- End function
 ;
 ; X64-LABEL: lshr_8i16:
 ; X64:       ## BB#0:
@@ -633,6 +648,7 @@ define <8 x i16> @lshr_8i16(<8 x i16> %r
 ; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
+; X64-NEXT:    ## -- End function
   %lshr = lshr <8 x i16> %r, %a
   ret <8 x i16> %lshr
 }
@@ -644,13 +660,14 @@ define <16 x i16> @lshr_16i16(<16 x i16>
 ; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
 ; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
 ; X32-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
-; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
 ; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
 ; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
 ; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpsrld $16, %ymm3, %ymm1
 ; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
-; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X32-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
 ; X32-NEXT:    retl
+; X32-NEXT:    ## -- End function
 ;
 ; X64-LABEL: lshr_16i16:
 ; X64:       ## BB#0:
@@ -658,13 +675,14 @@ define <16 x i16> @lshr_16i16(<16 x i16>
 ; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
 ; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
 ; X64-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
-; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
 ; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
 ; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
 ; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpsrld $16, %ymm3, %ymm1
 ; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
-; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; X64-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    retq
+; X64-NEXT:    ## -- End function
   %lshr = lshr <16 x i16> %r, %a
   ret <16 x i16> %lshr
 }
@@ -685,6 +703,7 @@ define <32 x i8> @lshr_32i8(<32 x i8> %r
 ; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X32-NEXT:    retl
+; X32-NEXT:    ## -- End function
 ;
 ; X64-LABEL: lshr_32i8:
 ; X64:       ## BB#0:
@@ -701,6 +720,7 @@ define <32 x i8> @lshr_32i8(<32 x i8> %r
 ; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X64-NEXT:    retq
+; X64-NEXT:    ## -- End function
   %lshr = lshr <32 x i8> %r, %a
   ret <32 x i8> %lshr
 }

Modified: llvm/trunk/test/CodeGen/X86/avx512-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-cmp.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-cmp.ll Tue Jun 27 08:05:13 2017
@@ -14,6 +14,7 @@ define double @test1(double %a, double %
 ; ALL-NEXT:  LBB0_2: ## %l2
 ; ALL-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; ALL-NEXT:    retq
+; ALL-NEXT:    ## -- End function
   %tobool = fcmp une double %a, %b
   br i1 %tobool, label %l1, label %l2
 
@@ -36,6 +37,7 @@ define float @test2(float %a, float %b)
 ; ALL-NEXT:  LBB1_2: ## %l2
 ; ALL-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; ALL-NEXT:    retq
+; ALL-NEXT:    ## -- End function
   %tobool = fcmp olt float %a, %b
   br i1 %tobool, label %l1, label %l2
 
@@ -124,11 +126,11 @@ entry:
 define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
 ; ALL-LABEL: test8:
 ; ALL:       ## BB#0:
-; ALL-NEXT:    notl %edi
 ; ALL-NEXT:    xorl $-2147483648, %esi ## imm = 0x80000000
 ; ALL-NEXT:    testl %edx, %edx
 ; ALL-NEXT:    movl $1, %eax
 ; ALL-NEXT:    cmovel %eax, %edx
+; ALL-NEXT:    notl %edi
 ; ALL-NEXT:    orl %edi, %esi
 ; ALL-NEXT:    cmovnel %edx, %eax
 ; ALL-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-cvt.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cvt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-cvt.ll Tue Jun 27 08:05:13 2017
@@ -1545,19 +1545,19 @@ define <4 x double> @uitofp_4i1_double(<
 }
 
 define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
-; NOVL-LABEL: uitofp_2i1_float:
-; NOVL:       # BB#0:
-; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; NOVL-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
-; NOVL-NEXT:    vpextrb $8, %xmm0, %eax
-; NOVL-NEXT:    andl $1, %eax
-; NOVL-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm1
-; NOVL-NEXT:    vpextrb $0, %xmm0, %eax
-; NOVL-NEXT:    andl $1, %eax
-; NOVL-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
-; NOVL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; NOVL-NEXT:    retq
+; KNL-LABEL: uitofp_2i1_float:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; KNL-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpextrb $8, %xmm0, %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    vpextrb $0, %xmm0, %ecx
+; KNL-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; KNL-NEXT:    andl $1, %ecx
+; KNL-NEXT:    vcvtsi2ssl %ecx, %xmm2, %xmm1
+; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; KNL-NEXT:    retq
 ;
 ; VL-LABEL: uitofp_2i1_float:
 ; VL:       # BB#0:
@@ -1567,6 +1567,34 @@ define <2 x float> @uitofp_2i1_float(<2
 ; VL-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
 ; VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
 ; VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitofp_2i1_float:
+; AVX512DQ:       # BB#0:
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512DQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vpextrb $8, %xmm0, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm1
+; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; AVX512DQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: uitofp_2i1_float:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512BW-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm1
+; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512BW-NEXT:    retq
   %mask = icmp ult <2 x i32> %a, zeroinitializer
   %1 = uitofp <2 x i1> %mask to <2 x float>
   ret <2 x float> %1

Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Tue Jun 27 08:05:13 2017
@@ -12,6 +12,7 @@ define <16 x float> @test1(<16 x float>
 ; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; KNL-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test1:
 ; SKX:       ## BB#0:
@@ -21,6 +22,7 @@ define <16 x float> @test1(<16 x float>
 ; SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; SKX-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %rrr = load float, float* %br
   %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
   %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
@@ -36,6 +38,7 @@ define <8 x double> @test2(<8 x double>
 ; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; KNL-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test2:
 ; SKX:       ## BB#0:
@@ -45,6 +48,7 @@ define <8 x double> @test2(<8 x double>
 ; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SKX-NEXT:    vinsertf64x2 $3, %xmm0, %zmm2, %zmm0
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %rrr = load double, double* %br
   %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
   %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@@ -58,6 +62,7 @@ define <16 x float> @test3(<16 x float>
 ; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
 ; KNL-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test3:
 ; SKX:       ## BB#0:
@@ -65,6 +70,7 @@ define <16 x float> @test3(<16 x float>
 ; SKX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
 ; SKX-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %eee = extractelement <16 x float> %x, i32 4
   %rrr2 = insertelement <16 x float> %x, float %eee, i32 1
   ret <16 x float> %rrr2
@@ -78,6 +84,7 @@ define <8 x i64> @test4(<8 x i64> %x) no
 ; KNL-NEXT:    vpinsrq $1, %rax, %xmm0, %xmm1
 ; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test4:
 ; SKX:       ## BB#0:
@@ -86,6 +93,7 @@ define <8 x i64> @test4(<8 x i64> %x) no
 ; SKX-NEXT:    vpinsrq $1, %rax, %xmm0, %xmm1
 ; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %eee = extractelement <8 x i64> %x, i32 4
   %rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
   ret <8 x i64> %rrr2
@@ -96,11 +104,13 @@ define i32 @test5(<4 x float> %x) nounwi
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vextractps $3, %xmm0, %eax
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test5:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vextractps $3, %xmm0, %eax
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %ef = extractelement <4 x float> %x, i32 3
   %ei = bitcast float %ef to i32
   ret i32 %ei
@@ -111,11 +121,13 @@ define void @test6(<4 x float> %x, float
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vextractps $3, %xmm0, (%rdi)
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test6:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vextractps $3, %xmm0, (%rdi)
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %ef = extractelement <4 x float> %x, i32 3
   store float %ef, float* %out, align 4
   ret void
@@ -135,6 +147,7 @@ define float @test7(<16 x float> %x, i32
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test7:
 ; SKX:       ## BB#0:
@@ -150,6 +163,7 @@ define float @test7(<16 x float> %x, i32
 ; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %e = extractelement <16 x float> %x, i32 %ind
   ret float %e
 }
@@ -168,6 +182,7 @@ define double @test8(<8 x double> %x, i3
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test8:
 ; SKX:       ## BB#0:
@@ -183,6 +198,7 @@ define double @test8(<8 x double> %x, i3
 ; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %e = extractelement <8 x double> %x, i32 %ind
   ret double %e
 }
@@ -201,6 +217,7 @@ define float @test9(<8 x float> %x, i32
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test9:
 ; SKX:       ## BB#0:
@@ -216,6 +233,7 @@ define float @test9(<8 x float> %x, i32
 ; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %e = extractelement <8 x float> %x, i32 %ind
   ret float %e
 }
@@ -234,6 +252,7 @@ define i32 @test10(<16 x i32> %x, i32 %i
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test10:
 ; SKX:       ## BB#0:
@@ -249,6 +268,7 @@ define i32 @test10(<16 x i32> %x, i32 %i
 ; SKX-NEXT:    popq %rbp
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %e = extractelement <16 x i32> %x, i32 %ind
   ret i32 %e
 }
@@ -1114,137 +1134,137 @@ define i32 @test_insertelement_v32i1(i32
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-32, %rsp
 ; KNL-NEXT:    subq $32, %rsp
-; KNL-NEXT:    xorl %eax, %eax
-; KNL-NEXT:    cmpl %esi, %edi
-; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vpcmpltud %zmm3, %zmm1, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
-; KNL-NEXT:    vmovd %edx, %xmm1
-; KNL-NEXT:    vpinsrb $1, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vmovd %ecx, %xmm1
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $4, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $5, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $6, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $7, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $9, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $10, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $11, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    vpinsrb $15, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
-; KNL-NEXT:    vmovd %edx, %xmm0
-; KNL-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $14, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    vmovd %eax, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    cmpl %esi, %edi
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    vpinsrb $15, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; KNL-NEXT:    vpsllw $7, %ymm0, %ymm0
 ; KNL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
@@ -1299,8 +1319,8 @@ define i8 @test_iinsertelement_v4i1(i32
 ; KNL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    vpextrb $4, %xmm0, %ecx
 ; KNL-NEXT:    kmovw %ecx, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; KNL-NEXT:    vpextrb $0, %xmm0, %ecx
+; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
@@ -2124,8 +2144,8 @@ define i16 @test_extractelement_variable
 define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
 ; KNL-LABEL: test_extractelement_variable_v16i8:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $15, %edi
 ; KNL-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
 ; KNL-NEXT:    movb (%rdi,%rax), %al
@@ -2156,8 +2176,8 @@ define i8 @test_extractelement_variable_
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-32, %rsp
 ; KNL-NEXT:    subq $64, %rsp
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $31, %edi
 ; KNL-NEXT:    movq %rsp, %rax
 ; KNL-NEXT:    movb (%rdi,%rax), %al
@@ -2204,9 +2224,9 @@ define i8 @test_extractelement_variable_
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-64, %rsp
 ; KNL-NEXT:    subq $128, %rsp
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $63, %edi
 ; KNL-NEXT:    movq %rsp, %rax
 ; KNL-NEXT:    movb (%rdi,%rax), %al
@@ -2295,12 +2315,12 @@ define i8 @test_extractelement_variable_
 define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
 ; KNL-LABEL: test_extractelement_varible_v2i1:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $1, %edi
 ; KNL-NEXT:    movl -24(%rsp,%rdi,8), %eax
 ; KNL-NEXT:    andl $1, %eax
@@ -2325,12 +2345,12 @@ define zeroext i8 @test_extractelement_v
 define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
 ; KNL-LABEL: test_extractelement_varible_v4i1:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
 ; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $3, %edi
 ; KNL-NEXT:    movl -24(%rsp,%rdi,4), %eax
 ; KNL-NEXT:    andl $1, %eax

Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll Tue Jun 27 08:05:13 2017
@@ -2880,7 +2880,6 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
 ; CHECK-LABEL: test_mask_vextractf32x4:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
 ; CHECK-NEXT:    kmovw %edi, %k0
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k1
 ; CHECK-NEXT:    kshiftrw $15, %k1, %k1
@@ -2898,6 +2897,7 @@ define <4 x float> @test_mask_vextractf3
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
 ; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
 ; CHECK-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
@@ -2941,7 +2941,6 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
 ; CHECK-LABEL: test_maskz_vextracti32x4:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
 ; CHECK-NEXT:    kmovw %edi, %k0
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k1
 ; CHECK-NEXT:    kshiftrw $15, %k1, %k1
@@ -2959,6 +2958,7 @@ define <4 x i32> @test_maskz_vextracti32
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
 ; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
 ; CHECK-NEXT:    vpsrad $31, %xmm1, %xmm1
 ; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Tue Jun 27 08:05:13 2017
@@ -1837,73 +1837,8 @@ define void @ktest_2(<32 x float> %in, f
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-32, %rsp
 ; KNL-NEXT:    subq $32, %rsp
-; KNL-NEXT:    vmovups (%rdi), %zmm2
-; KNL-NEXT:    vmovups 64(%rdi), %zmm3
-; KNL-NEXT:    vcmpltps %zmm1, %zmm3, %k1
-; KNL-NEXT:    kshiftlw $14, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    kshiftlw $15, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm3
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $13, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $12, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $11, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $10, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $9, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $8, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $7, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $6, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $5, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $4, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $3, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $2, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftlw $1, %k1, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; KNL-NEXT:    kshiftrw $15, %k1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT:    vcmpltps %zmm0, %zmm2, %k2
+; KNL-NEXT:    vmovups 64(%rdi), %zmm2
+; KNL-NEXT:    vcmpltps %zmm1, %zmm2, %k2
 ; KNL-NEXT:    kshiftlw $14, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1967,138 +1902,203 @@ define void @ktest_2(<32 x float> %in, f
 ; KNL-NEXT:    kshiftrw $15, %k2, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; KNL-NEXT:    vmovups 4(%rdi), %zmm3 {%k2} {z}
-; KNL-NEXT:    vmovups 68(%rdi), %zmm4 {%k1} {z}
+; KNL-NEXT:    vmovups (%rdi), %zmm3
+; KNL-NEXT:    vcmpltps %zmm0, %zmm3, %k1
+; KNL-NEXT:    kshiftlw $14, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kshiftlw $15, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vmovd %ecx, %xmm3
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $13, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $12, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $11, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $10, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $9, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $8, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $7, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $6, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $5, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $4, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $3, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $2, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftlw $1, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kshiftrw $15, %k1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vmovups 68(%rdi), %zmm4 {%k2} {z}
 ; KNL-NEXT:    vcmpltps %zmm4, %zmm1, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    kshiftlw $15, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kshiftlw $14, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $15, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %ecx
 ; KNL-NEXT:    vmovd %ecx, %xmm4
 ; KNL-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $13, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $13, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $12, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $12, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $11, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $11, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $10, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $10, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $9, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $9, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $8, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $8, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $7, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $7, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $6, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $6, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $5, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $5, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $4, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $4, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $3, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $3, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $2, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $2, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $1, %k0, %k1
-; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $1, %k0, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
+; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
-; KNL-NEXT:    vcmpltps %zmm3, %zmm0, %k0
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm3
+; KNL-NEXT:    vmovups 4(%rdi), %zmm4 {%k1} {z}
+; KNL-NEXT:    vcmpltps %zmm4, %zmm0, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm3
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vmovd %ecx, %xmm4
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
+; KNL-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
 ; KNL-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; KNL-NEXT:    vpmovsxbd %xmm3, %zmm3
@@ -2943,36 +2943,6 @@ define void @store_64i1(<64 x i1>* %a, <
 ;
 ; KNL-LABEL: store_64i1:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:  Lcfi9:
-; KNL-NEXT:    .cfi_def_cfa_offset 16
-; KNL-NEXT:    pushq %r15
-; KNL-NEXT:  Lcfi10:
-; KNL-NEXT:    .cfi_def_cfa_offset 24
-; KNL-NEXT:    pushq %r14
-; KNL-NEXT:  Lcfi11:
-; KNL-NEXT:    .cfi_def_cfa_offset 32
-; KNL-NEXT:    pushq %r13
-; KNL-NEXT:  Lcfi12:
-; KNL-NEXT:    .cfi_def_cfa_offset 40
-; KNL-NEXT:    pushq %r12
-; KNL-NEXT:  Lcfi13:
-; KNL-NEXT:    .cfi_def_cfa_offset 48
-; KNL-NEXT:    pushq %rbx
-; KNL-NEXT:  Lcfi14:
-; KNL-NEXT:    .cfi_def_cfa_offset 56
-; KNL-NEXT:  Lcfi15:
-; KNL-NEXT:    .cfi_offset %rbx, -56
-; KNL-NEXT:  Lcfi16:
-; KNL-NEXT:    .cfi_offset %r12, -48
-; KNL-NEXT:  Lcfi17:
-; KNL-NEXT:    .cfi_offset %r13, -40
-; KNL-NEXT:  Lcfi18:
-; KNL-NEXT:    .cfi_offset %r14, -32
-; KNL-NEXT:  Lcfi19:
-; KNL-NEXT:    .cfi_offset %r15, -24
-; KNL-NEXT:  Lcfi20:
-; KNL-NEXT:    .cfi_offset %rbp, -16
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -2984,281 +2954,275 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r8d
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r10d
+; KNL-NEXT:    vmovd %ecx, %xmm3
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r11d
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r14d
+; KNL-NEXT:    vpinsrb $2, %ecx, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r15d
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r12d
+; KNL-NEXT:    vpinsrb $4, %ecx, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r13d
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebx
+; KNL-NEXT:    vpinsrb $6, %ecx, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebp
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    vpinsrb $10, %ecx, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %esi
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vmovd %r9d, %xmm3
-; KNL-NEXT:    kmovw %k1, %r9d
-; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k2
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $1, %r8d, %xmm3, %xmm2
-; KNL-NEXT:    vpinsrb $2, %r10d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $3, %r11d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $4, %r14d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $5, %r15d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $6, %r12d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $7, %r13d, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $8, %ebx, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $9, %ebp, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $12, %edx, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $13, %esi, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $14, %r9d, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
-; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; KNL-NEXT:    kmovw %k0, 6(%rdi)
-; KNL-NEXT:    kshiftlw $14, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r8d
-; KNL-NEXT:    kshiftlw $15, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r10d
-; KNL-NEXT:    kshiftlw $13, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r9d
-; KNL-NEXT:    kshiftlw $12, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r11d
-; KNL-NEXT:    kshiftlw $11, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r14d
-; KNL-NEXT:    kshiftlw $10, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r15d
-; KNL-NEXT:    kshiftlw $9, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r12d
-; KNL-NEXT:    kshiftlw $8, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r13d
-; KNL-NEXT:    kshiftlw $7, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    kshiftlw $6, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %esi
-; KNL-NEXT:    kshiftlw $5, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ebp
-; KNL-NEXT:    kshiftlw $4, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ebx
-; KNL-NEXT:    kshiftlw $3, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    kshiftlw $2, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %edx
-; KNL-NEXT:    kshiftlw $1, %k2, %k0
+; KNL-NEXT:    vpinsrb $12, %ecx, %xmm3, %xmm3
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vmovd %r10d, %xmm2
-; KNL-NEXT:    kmovw %k0, %r10d
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT:    kshiftrw $15, %k2, %k0
-; KNL-NEXT:    vpinsrb $1, %r8d, %xmm2, %xmm1
-; KNL-NEXT:    vpinsrb $2, %r9d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $3, %r11d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $4, %r14d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $5, %r15d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $6, %r12d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $7, %r13d, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $9, %esi, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $10, %ebp, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $11, %ebx, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $13, %edx, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $14, %r10d, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm2
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, 4(%rdi)
 ; KNL-NEXT:    kshiftlw $14, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r8d
+; KNL-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $15, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r10d
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $13, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r9d
+; KNL-NEXT:    vmovd %eax, %xmm3
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $12, %k1, %k0
+; KNL-NEXT:    vpinsrb $1, %ecx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r11d
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $11, %k1, %k0
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r14d
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $10, %k1, %k0
+; KNL-NEXT:    vpinsrb $3, %ecx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r15d
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $9, %k1, %k0
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r12d
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $8, %k1, %k0
+; KNL-NEXT:    vpinsrb $5, %ecx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %r13d
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $7, %k1, %k0
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $6, %k1, %k0
+; KNL-NEXT:    vpinsrb $7, %ecx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %esi
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $5, %k1, %k0
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ebp
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $4, %k1, %k0
+; KNL-NEXT:    vpinsrb $9, %ecx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ebx
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $3, %k1, %k0
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $2, %k1, %k0
+; KNL-NEXT:    vpinsrb $11, %ecx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $1, %k1, %k0
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vmovd %r10d, %xmm1
-; KNL-NEXT:    kmovw %k0, %r10d
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm0
-; KNL-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $9, %esi, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $10, %ebp, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $14, %r10d, %xmm0, %xmm0
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT:    kmovw %k1, 2(%rdi)
+; KNL-NEXT:    vpmovsxbd %xmm2, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vpinsrb $13, %ecx, %xmm3, %xmm2
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm1
+; KNL-NEXT:    vpinsrb $15, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, 6(%rdi)
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT:    kmovw %k1, 4(%rdi)
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r8d
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r10d
+; KNL-NEXT:    vmovd %ecx, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r11d
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r14d
+; KNL-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r15d
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r12d
+; KNL-NEXT:    vpinsrb $4, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %r13d
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    vpinsrb $6, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %esi
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebp
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ebx
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $10, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vmovd %r9d, %xmm0
-; KNL-NEXT:    kmovw %k1, %r9d
-; KNL-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $2, %r10d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $8, %edx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $9, %esi, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $10, %ebp, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm0
+; KNL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $14, %r9d, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, 2(%rdi)
+; KNL-NEXT:    kshiftlw $14, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kshiftlw $15, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kshiftlw $13, %k1, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vmovd %ecx, %xmm0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kshiftlw $12, %k1, %k0
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kshiftlw $11, %k1, %k0
+; KNL-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kshiftlw $10, %k1, %k0
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kshiftlw $9, %k1, %k0
+; KNL-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kshiftlw $8, %k1, %k0
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kshiftlw $7, %k1, %k0
+; KNL-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kshiftlw $6, %k1, %k0
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kshiftlw $5, %k1, %k0
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kshiftlw $4, %k1, %k0
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kshiftlw $3, %k1, %k0
+; KNL-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kshiftlw $2, %k1, %k0
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kshiftlw $1, %k1, %k0
+; KNL-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kshiftrw $15, %k1, %k0
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, (%rdi)
-; KNL-NEXT:    popq %rbx
-; KNL-NEXT:    popq %r12
-; KNL-NEXT:    popq %r13
-; KNL-NEXT:    popq %r14
-; KNL-NEXT:    popq %r15
-; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_64i1:

Modified: llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll Tue Jun 27 08:05:13 2017
@@ -8,6 +8,7 @@ define <16 x float> @test1(<16 x float>
 ; CHECK-NEXT:    vcmpleps %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask = fcmp ole <16 x float> %x, %y
   %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
   ret <16 x float> %max
@@ -19,6 +20,7 @@ define <8 x double> @test2(<8 x double>
 ; CHECK-NEXT:    vcmplepd %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask = fcmp ole <8 x double> %x, %y
   %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
   ret <8 x double> %max
@@ -30,6 +32,7 @@ define <16 x i32> @test3(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpeqd (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %y = load <16 x i32>, <16 x i32>* %yp, align 4
   %mask = icmp eq <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -42,6 +45,7 @@ define <16 x i32> @test4_unsigned(<16 x
 ; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask = icmp uge <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
   ret <16 x i32> %max
@@ -53,6 +57,7 @@ define <8 x i64> @test5(<8 x i64> %x, <8
 ; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask = icmp eq <8 x i64> %x, %y
   %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
   ret <8 x i64> %max
@@ -64,6 +69,7 @@ define <8 x i64> @test6_unsigned(<8 x i6
 ; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask = icmp ugt <8 x i64> %x, %y
   %max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
   ret <8 x i64> %max
@@ -117,12 +123,14 @@ define <8 x i32> @test9(<8 x i32> %x, <8
 ; KNL-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test9:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
 ; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %mask = icmp eq <8 x i32> %x, %y
   %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
   ret <8 x i32> %max
@@ -137,12 +145,14 @@ define <8 x float> @test10(<8 x float> %
 ; KNL-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test10:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
 ; SKX-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
 
   %mask = fcmp oeq <8 x float> %x, %y
   %max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
@@ -154,6 +164,7 @@ define <8 x i32> @test11_unsigned(<8 x i
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask = icmp ugt <8 x i32> %x, %y
   %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
   ret <8 x i32> %max
@@ -168,6 +179,7 @@ define i16 @test12(<16 x i64> %a, <16 x
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test12:
 ; SKX:       ## BB#0:
@@ -178,6 +190,7 @@ define i16 @test12(<16 x i64> %a, <16 x
 ; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %res = icmp eq <16 x i64> %a, %b
   %res1 = bitcast <16 x i1> %res to i16
   ret i16 %res1
@@ -330,6 +343,7 @@ define i32 @test12_v32i32(<32 x i32> %a,
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test12_v32i32:
 ; SKX:       ## BB#0:
@@ -339,6 +353,7 @@ define i32 @test12_v32i32(<32 x i32> %a,
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %res = icmp eq <32 x i32> %a, %b
   %res1 = bitcast <32 x i1> %res to i32
   ret i32 %res1
@@ -562,72 +577,72 @@ define i64 @test12_v64i16(<64 x i16> %a,
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT:    vpcmpeqw %ymm6, %ymm2, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    vpcmpeqw %ymm6, %ymm2, %ymm1
+; KNL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm0
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vmovd %ecx, %xmm1
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm0
+; KNL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -642,6 +657,7 @@ define i64 @test12_v64i16(<64 x i16> %a,
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test12_v64i16:
 ; SKX:       ## BB#0:
@@ -651,6 +667,7 @@ define i64 @test12_v64i16(<64 x i16> %a,
 ; SKX-NEXT:    kmovq %k0, %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %res = icmp eq <64 x i16> %a, %b
   %res1 = bitcast <64 x i1> %res to i64
   ret i64 %res1
@@ -704,6 +721,7 @@ define <16 x i32> @test16(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpled %zmm0, %zmm1, %k1
 ; CHECK-NEXT:    vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask = icmp sge <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
   ret <16 x i32> %max
@@ -715,6 +733,7 @@ define <16 x i32> @test17(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpgtd (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
   %mask = icmp sgt <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -727,6 +746,7 @@ define <16 x i32> @test18(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpled (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
   %mask = icmp sle <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -739,6 +759,7 @@ define <16 x i32> @test19(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpleud (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
   %mask = icmp ule <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -752,6 +773,7 @@ define <16 x i32> @test20(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask1 = icmp eq <16 x i32> %x1, %y1
   %mask0 = icmp eq <16 x i32> %x, %y
   %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -766,6 +788,7 @@ define <8 x i64> @test21(<8 x i64> %x, <
 ; CHECK-NEXT:    vpcmpleq %zmm2, %zmm3, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask1 = icmp sge <8 x i64> %x1, %y1
   %mask0 = icmp sle <8 x i64> %x, %y
   %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -780,6 +803,7 @@ define <8 x i64> @test22(<8 x i64> %x, <
 ; CHECK-NEXT:    vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask1 = icmp sgt <8 x i64> %x1, %y1
   %y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
   %mask0 = icmp sgt <8 x i64> %x, %y
@@ -795,6 +819,7 @@ define <16 x i32> @test23(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpleud (%rdi), %zmm0, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask1 = icmp sge <16 x i32> %x1, %y1
   %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
   %mask0 = icmp ule <16 x i32> %x, %y
@@ -809,6 +834,7 @@ define <8 x i64> @test24(<8 x i64> %x, <
 ; CHECK-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k1
 ; CHECK-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %yb = load i64, i64* %yb.ptr, align 4
   %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
   %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -823,6 +849,7 @@ define <16 x i32> @test25(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpled (%rdi){1to16}, %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %yb = load i32, i32* %yb.ptr, align 4
   %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
   %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -838,6 +865,7 @@ define <16 x i32> @test26(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask1 = icmp sge <16 x i32> %x1, %y1
   %yb = load i32, i32* %yb.ptr, align 4
   %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -855,6 +883,7 @@ define <8 x i64> @test27(<8 x i64> %x, i
 ; CHECK-NEXT:    vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %mask1 = icmp sge <8 x i64> %x1, %y1
   %yb = load i64, i64* %yb.ptr, align 4
   %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -920,12 +949,14 @@ define <4 x double> @test30(<4 x double>
 ; KNL-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm2
 ; KNL-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test30:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
 ; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
 
   %mask = fcmp oeq <4 x double> %x, %y
   %max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %y
@@ -938,12 +969,14 @@ define <2 x double> @test31(<2 x double>
 ; KNL-NEXT:    vcmpltpd (%rdi), %xmm0, %xmm2
 ; KNL-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test31:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltpd (%rdi), %xmm0, %k1
 ; SKX-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
 
   %y = load <2 x double>, <2 x double>* %yp, align 4
   %mask = fcmp olt <2 x double> %x, %y
@@ -957,12 +990,14 @@ define <4 x double> @test32(<4 x double>
 ; KNL-NEXT:    vcmpltpd (%rdi), %ymm0, %ymm2
 ; KNL-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test32:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltpd (%rdi), %ymm0, %k1
 ; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
 
   %y = load <4 x double>, <4 x double>* %yp, align 4
   %mask = fcmp ogt <4 x double> %y, %x
@@ -976,6 +1011,7 @@ define <8 x double> @test33(<8 x double>
 ; CHECK-NEXT:    vcmpltpd (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %y = load <8 x double>, <8 x double>* %yp, align 4
   %mask = fcmp olt <8 x double> %x, %y
   %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1
@@ -988,12 +1024,14 @@ define <4 x float> @test34(<4 x float> %
 ; KNL-NEXT:    vcmpltps (%rdi), %xmm0, %xmm2
 ; KNL-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test34:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltps (%rdi), %xmm0, %k1
 ; SKX-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
   %y = load <4 x float>, <4 x float>* %yp, align 4
   %mask = fcmp olt <4 x float> %x, %y
   %max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1
@@ -1010,12 +1048,14 @@ define <8 x float> @test35(<8 x float> %
 ; KNL-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test35:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltps (%rdi), %ymm0, %k1
 ; SKX-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
 
   %y = load <8 x float>, <8 x float>* %yp, align 4
   %mask = fcmp ogt <8 x float> %y, %x
@@ -1029,6 +1069,7 @@ define <16 x float> @test36(<16 x float>
 ; CHECK-NEXT:    vcmpltps (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %y = load <16 x float>, <16 x float>* %yp, align 4
   %mask = fcmp olt <16 x float> %x, %y
   %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1
@@ -1041,6 +1082,7 @@ define <8 x double> @test37(<8 x double>
 ; CHECK-NEXT:    vcmpltpd (%rdi){1to8}, %zmm0, %k1
 ; CHECK-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
 
   %a = load double, double* %ptr
   %v = insertelement <8 x double> undef, double %a, i32 0
@@ -1058,12 +1100,14 @@ define <4 x double> @test38(<4 x double>
 ; KNL-NEXT:    vcmpltpd %ymm2, %ymm0, %ymm2
 ; KNL-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test38:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltpd (%rdi){1to4}, %ymm0, %k1
 ; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
 
   %a = load double, double* %ptr
   %v = insertelement <4 x double> undef, double %a, i32 0
@@ -1081,12 +1125,14 @@ define <2 x double> @test39(<2 x double>
 ; KNL-NEXT:    vcmpltpd %xmm2, %xmm0, %xmm2
 ; KNL-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test39:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltpd (%rdi){1to2}, %xmm0, %k1
 ; SKX-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
 
   %a = load double, double* %ptr
   %v = insertelement <2 x double> undef, double %a, i32 0
@@ -1104,6 +1150,7 @@ define <16  x float> @test40(<16  x floa
 ; CHECK-NEXT:    vcmpltps (%rdi){1to16}, %zmm0, %k1
 ; CHECK-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
 
   %a = load float, float* %ptr
   %v = insertelement <16  x float> undef, float %a, i32 0
@@ -1124,12 +1171,14 @@ define <8  x float> @test41(<8  x float>
 ; KNL-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test41:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltps (%rdi){1to8}, %ymm0, %k1
 ; SKX-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
 
   %a = load float, float* %ptr
   %v = insertelement <8  x float> undef, float %a, i32 0
@@ -1147,12 +1196,14 @@ define <4  x float> @test42(<4  x float>
 ; KNL-NEXT:    vcmpltps %xmm2, %xmm0, %xmm2
 ; KNL-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test42:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltps (%rdi){1to4}, %xmm0, %k1
 ; SKX-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
 
   %a = load float, float* %ptr
   %v = insertelement <4  x float> undef, float %a, i32 0
@@ -1172,6 +1223,7 @@ define <8 x double> @test43(<8 x double>
 ; KNL-NEXT:    vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
 ; KNL-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; KNL-NEXT:    retq
+; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test43:
 ; SKX:       ## BB#0:
@@ -1180,6 +1232,7 @@ define <8 x double> @test43(<8 x double>
 ; SKX-NEXT:    vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
 ; SKX-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT:    retq
+; SKX-NEXT:    ## -- End function
 
   %a = load double, double* %ptr
   %v = insertelement <8 x double> undef, double %a, i32 0

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll Tue Jun 27 08:05:13 2017
@@ -1685,8 +1685,6 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    .cfi_offset %esi, -12
 ; AVX512F-32-NEXT:  .Lcfi9:
 ; AVX512F-32-NEXT:    .cfi_offset %ebx, -8
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $5, %al
@@ -1707,39 +1705,39 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    kmovd %ecx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %edx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %ebx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $6, %al
@@ -1748,8 +1746,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
@@ -1758,8 +1756,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %al
@@ -1767,8 +1765,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    andb $2, %al
@@ -1777,8 +1775,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %dl
@@ -1789,8 +1787,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
@@ -1798,8 +1796,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1809,8 +1807,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1820,8 +1818,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1831,8 +1829,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1842,8 +1840,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1852,8 +1850,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -1864,8 +1862,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %bl
@@ -1877,8 +1875,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %dl
@@ -1887,8 +1885,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -1898,8 +1896,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -1910,8 +1908,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -1921,8 +1919,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
@@ -1932,8 +1930,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1942,8 +1940,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -1952,444 +1950,444 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm3, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
 ; AVX512F-32-NEXT:    movb %dl, %al
 ; AVX512F-32-NEXT:    shrb $2, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm5
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm4, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm5
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm6
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $28, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT:    vpbroadcastd %xmm5, %xmm5
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm6
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm6, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    movl %ecx, %esi
 ; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm6
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm7
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm2, %ymm7, %ymm7
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %esi, %eax
 ; AVX512F-32-NEXT:    shrl $30, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %esi, %eax
 ; AVX512F-32-NEXT:    shrl $31, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    andb $2, %al
 ; AVX512F-32-NEXT:    shrb %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
 ; AVX512F-32-NEXT:    movb %dl, %al
 ; AVX512F-32-NEXT:    shrb $2, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $4, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $5, %al
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $6, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $7, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    andb $2, %al
 ; AVX512F-32-NEXT:    shrb %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
 ; AVX512F-32-NEXT:    movb %dl, %al
 ; AVX512F-32-NEXT:    shrb $2, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    andl $61440, %eax # imm = 0xF000
 ; AVX512F-32-NEXT:    shrl $12, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $13, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    andl $49152, %eax # imm = 0xC000
 ; AVX512F-32-NEXT:    shrl $14, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    andl $32768, %eax # imm = 0x8000
 ; AVX512F-32-NEXT:    shrl $15, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %ebx
 ; AVX512F-32-NEXT:    shrl $16, %ebx
 ; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %bl, %dl
 ; AVX512F-32-NEXT:    andb $2, %dl
 ; AVX512F-32-NEXT:    shrb %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512F-32-NEXT:    movb %bl, %al
 ; AVX512F-32-NEXT:    andb $15, %al
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    shrb $2, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %bl, %al
 ; AVX512F-32-NEXT:    shrb $4, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %bl, %al
 ; AVX512F-32-NEXT:    shrb $5, %al
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %bl, %al
 ; AVX512F-32-NEXT:    shrb $6, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
 ; AVX512F-32-NEXT:    shrb $7, %bl
 ; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $24, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    andb $2, %dl
 ; AVX512F-32-NEXT:    shrb %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
 ; AVX512F-32-NEXT:    movb %dl, %al
 ; AVX512F-32-NEXT:    shrb $2, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    kmovd %eax, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $28, %eax
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vpbroadcastd %xmm4, %xmm4
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $31, %eax
 ; AVX512F-32-NEXT:    kshiftlq $1, %k0, %k0
@@ -2397,12 +2395,12 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
 ; AVX512F-32-NEXT:    korq %k1, %k0, %k1
-; AVX512F-32-NEXT:    vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT:    vpcmpgtb %zmm5, %zmm6, %k2 {%k1}
-; AVX512F-32-NEXT:    vpcmpleb %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT:    vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT:    vpcmpleb %zmm5, %zmm6, %k5 {%k1}
-; AVX512F-32-NEXT:    vpcmpgtb %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpgtb %zmm0, %zmm1, %k2 {%k1}
+; AVX512F-32-NEXT:    vpcmpleb %zmm1, %zmm0, %k3 {%k1}
+; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k4 {%k1}
+; AVX512F-32-NEXT:    vpcmpleb %zmm0, %zmm1, %k5 {%k1}
+; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1 {%k1}
 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -2571,8 +2569,6 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    .cfi_offset %esi, -12
 ; AVX512F-32-NEXT:  .Lcfi15:
 ; AVX512F-32-NEXT:    .cfi_offset %ebx, -8
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $5, %al
@@ -2593,39 +2589,39 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    kmovd %ecx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %edx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %ebx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $6, %al
@@ -2634,8 +2630,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
@@ -2644,8 +2640,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %al
@@ -2653,8 +2649,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    andb $2, %al
@@ -2663,8 +2659,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %dl
@@ -2675,8 +2671,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
@@ -2684,8 +2680,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2695,8 +2691,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2706,8 +2702,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2717,8 +2713,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2728,8 +2724,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2738,8 +2734,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -2750,8 +2746,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %bl
@@ -2763,8 +2759,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %dl
@@ -2773,8 +2769,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -2784,8 +2780,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -2796,8 +2792,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -2807,8 +2803,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
@@ -2818,8 +2814,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2828,8 +2824,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
@@ -2838,444 +2834,444 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm3, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
 ; AVX512F-32-NEXT:    movb %dl, %al
 ; AVX512F-32-NEXT:    shrb $2, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm5
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm4, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm5
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm6
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $28, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT:    vpbroadcastd %xmm5, %xmm5
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm6
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm6, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    movl %ecx, %esi
 ; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm6
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm7
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm2, %ymm7, %ymm7
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %esi, %eax
 ; AVX512F-32-NEXT:    shrl $30, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %esi, %eax
 ; AVX512F-32-NEXT:    shrl $31, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    andb $2, %al
 ; AVX512F-32-NEXT:    shrb %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
 ; AVX512F-32-NEXT:    movb %dl, %al
 ; AVX512F-32-NEXT:    shrb $2, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $4, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $5, %al
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $6, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %cl, %al
 ; AVX512F-32-NEXT:    shrb $7, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    andb $2, %al
 ; AVX512F-32-NEXT:    shrb %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
 ; AVX512F-32-NEXT:    movb %dl, %al
 ; AVX512F-32-NEXT:    shrb $2, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    andl $61440, %eax # imm = 0xF000
 ; AVX512F-32-NEXT:    shrl $12, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $13, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    andl $49152, %eax # imm = 0xC000
 ; AVX512F-32-NEXT:    shrl $14, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    andl $32768, %eax # imm = 0x8000
 ; AVX512F-32-NEXT:    shrl $15, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %ebx
 ; AVX512F-32-NEXT:    shrl $16, %ebx
 ; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %bl, %dl
 ; AVX512F-32-NEXT:    andb $2, %dl
 ; AVX512F-32-NEXT:    shrb %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512F-32-NEXT:    movb %bl, %al
 ; AVX512F-32-NEXT:    andb $15, %al
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    shrb $2, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %bl, %al
 ; AVX512F-32-NEXT:    shrb $4, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %bl, %al
 ; AVX512F-32-NEXT:    shrb $5, %al
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %bl, %al
 ; AVX512F-32-NEXT:    shrb $6, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
 ; AVX512F-32-NEXT:    shrb $7, %bl
 ; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $24, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    andb $2, %dl
 ; AVX512F-32-NEXT:    shrb %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512F-32-NEXT:    movb %al, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
 ; AVX512F-32-NEXT:    movb %dl, %al
 ; AVX512F-32-NEXT:    shrb $2, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    kmovd %eax, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $28, %eax
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vpbroadcastd %xmm4, %xmm4
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $31, %eax
 ; AVX512F-32-NEXT:    kshiftlq $1, %k0, %k0
@@ -3283,12 +3279,12 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
 ; AVX512F-32-NEXT:    korq %k1, %k0, %k1
-; AVX512F-32-NEXT:    vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT:    vpcmpltub %zmm6, %zmm5, %k2 {%k1}
-; AVX512F-32-NEXT:    vpcmpleub %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT:    vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT:    vpcmpnltub %zmm6, %zmm5, %k5 {%k1}
-; AVX512F-32-NEXT:    vpcmpnleub %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpltub %zmm1, %zmm0, %k2 {%k1}
+; AVX512F-32-NEXT:    vpcmpleub %zmm1, %zmm0, %k3 {%k1}
+; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k4 {%k1}
+; AVX512F-32-NEXT:    vpcmpnltub %zmm1, %zmm0, %k5 {%k1}
+; AVX512F-32-NEXT:    vpcmpnleub %zmm1, %zmm0, %k1 {%k1}
 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx

Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll Tue Jun 27 08:05:13 2017
@@ -2695,32 +2695,32 @@ declare <32 x i8> @llvm.x86.avx512.mask.
 define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 ; CHECK-LABEL: test_cmp_b_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
-; CHECK-NEXT:    kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
-; CHECK-NEXT:    vpcmpgtb %ymm0, %ymm1, %k0 ## encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0]
-; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT:    vpcmpleb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02]
-; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
 ; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04]
-; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    vpcmpleb %ymm0, %ymm1, %k0 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xc0,0x02]
-; CHECK-NEXT:    kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
 ; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1]
-; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; CHECK-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
-; CHECK-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x02]
+; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT:    vmovd %eax, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
+; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01]
+; CHECK-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd2,0x02]
 ; CHECK-NEXT:    kxnord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x46,0xc0]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03]
-; CHECK-NEXT:    vmovd %ecx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9]
-; CHECK-NEXT:    vmovd %r8d, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0]
-; CHECK-NEXT:    vpunpckldq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
-; CHECK-NEXT:    ## xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-NEXT:    vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
-; CHECK-NEXT:    vpunpcklqdq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
-; CHECK-NEXT:    ## xmm1 = xmm1[0],xmm2[0]
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
+; CHECK-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x03]
+; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vpcmpgtb %ymm0, %ymm1, %k0 ## encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vpcmpleb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02]
+; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; CHECK-NEXT:    vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8]
+; CHECK-NEXT:    vpunpckldq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x62,0xc0]
+; CHECK-NEXT:    ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    vmovd %edx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca]
+; CHECK-NEXT:    vpunpcklqdq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1]
+; CHECK-NEXT:    ## xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc2,0x01]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
   %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
@@ -2750,23 +2750,23 @@ define <8 x i32> @test_mask_cmp_b_256(<3
 ; CHECK-NEXT:    vpcmpgtb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0]
 ; CHECK-NEXT:    kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
 ; CHECK-NEXT:    vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02]
-; CHECK-NEXT:    kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
-; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
 ; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
-; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
 ; CHECK-NEXT:    vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02]
-; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
-; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT:    vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
-; CHECK-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
+; CHECK-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
 ; CHECK-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
 ; CHECK-NEXT:    vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
 ; CHECK-NEXT:    vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT:    vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT:    vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
 ; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
@@ -2793,32 +2793,32 @@ declare i32 @llvm.x86.avx512.mask.cmp.b.
 define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 ; CHECK-LABEL: test_ucmp_b_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
-; CHECK-NEXT:    kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
-; CHECK-NEXT:    vpcmpltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01]
-; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT:    vpcmpleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02]
-; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
 ; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04]
-; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    vpcmpnltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05]
-; CHECK-NEXT:    kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
 ; CHECK-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x06]
-; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; CHECK-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
-; CHECK-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x02]
+; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT:    vmovd %eax, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
+; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01]
+; CHECK-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd2,0x02]
 ; CHECK-NEXT:    kxnord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x46,0xc0]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03]
-; CHECK-NEXT:    vmovd %ecx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9]
-; CHECK-NEXT:    vmovd %r8d, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0]
-; CHECK-NEXT:    vpunpckldq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
-; CHECK-NEXT:    ## xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-NEXT:    vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
-; CHECK-NEXT:    vpunpcklqdq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
-; CHECK-NEXT:    ## xmm1 = xmm1[0],xmm2[0]
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
+; CHECK-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x03]
+; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vpcmpltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vpcmpleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02]
+; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; CHECK-NEXT:    vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8]
+; CHECK-NEXT:    vpunpckldq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x62,0xc0]
+; CHECK-NEXT:    ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    vmovd %edx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca]
+; CHECK-NEXT:    vpunpcklqdq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1]
+; CHECK-NEXT:    ## xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc2,0x01]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
   %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
@@ -2848,23 +2848,23 @@ define <8 x i32> @test_mask_ucmp_b_256(<
 ; CHECK-NEXT:    vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01]
 ; CHECK-NEXT:    kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
 ; CHECK-NEXT:    vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02]
-; CHECK-NEXT:    kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
-; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
 ; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
-; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
 ; CHECK-NEXT:    vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
-; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
-; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT:    vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
-; CHECK-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
+; CHECK-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
 ; CHECK-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
 ; CHECK-NEXT:    vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
 ; CHECK-NEXT:    vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT:    vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT:    vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
 ; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)

Modified: llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-256.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-256.ll Tue Jun 27 08:05:13 2017
@@ -453,10 +453,10 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8
 ; SSE2-SSSE3-NEXT:    pcmpgtb %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT:    pcmpgtb %xmm3, %xmm1
 ; SSE2-SSSE3-NEXT:    pcmpgtb %xmm6, %xmm4
-; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm4
 ; SSE2-SSSE3-NEXT:    pcmpgtb %xmm7, %xmm5
 ; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm5
 ; SSE2-SSSE3-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm4
 ; SSE2-SSSE3-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp)
 ; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-SSSE3-NEXT:    andb $1, %al

Modified: llvm/trunk/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/extractelement-legalization-store-ordering.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/extractelement-legalization-store-ordering.ll (original)
+++ llvm/trunk/test/CodeGen/X86/extractelement-legalization-store-ordering.ll Tue Jun 27 08:05:13 2017
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple i386-apple-darwin -mcpu=yonah | FileCheck %s
 
 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
@@ -6,31 +7,32 @@ target datalayout = "e-m:o-p:32:32-f64:3
 ; into loads, off the stack or a previous store.
 ; Be very explicit about the ordering/stack offsets.
 
-; CHECK-LABEL: test_extractelement_legalization_storereuse:
-; CHECK:      # BB#0
-; CHECK-NEXT: pushl %ebx
-; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: movl 16(%esp), %eax
-; CHECK-NEXT: movl 24(%esp), %ecx
-; CHECK-NEXT: movl 20(%esp), %edx
-; CHECK-NEXT: paddd (%edx), %xmm0
-; CHECK-NEXT: movdqa %xmm0, (%edx)
-; CHECK-NEXT:	movl	(%edx), %esi
-; CHECK-NEXT:	movl	4(%edx), %edi
-; CHECK-NEXT:	shll	$4, %ecx
-; CHECK-NEXT:	movl	8(%edx), %ebx
-; CHECK-NEXT:	movl	12(%edx), %edx
-; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
-; CHECK-NEXT: movl %edi, (%eax,%ecx)
-; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
-; CHECK-NEXT: movl %edx, 4(%eax,%ecx)
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: popl %edi
-; CHECK-NEXT: popl %ebx
-; CHECK-NEXT: retl
 
 define void @test_extractelement_legalization_storereuse(<4 x i32> %a, i32* nocapture %x, i32* nocapture readonly %y, i32 %i) #0 {
+; CHECK-LABEL:   _test_extractelement_legalization_storereuse: ## @test_extractelement_legalization_storereuse 
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    paddd (%ecx), %xmm0
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movdqa %xmm0, (%ecx)
+; CHECK-NEXT:    movl (%ecx), %esi
+; CHECK-NEXT:    movl 4(%ecx), %edi
+; CHECK-NEXT:    shll $4, %edx
+; CHECK-NEXT:    movl 8(%ecx), %ebx
+; CHECK-NEXT:    movl 12(%ecx), %ecx
+; CHECK-NEXT:    movl %esi, 12(%eax,%edx)
+; CHECK-NEXT:    movl %edi, (%eax,%edx)
+; CHECK-NEXT:    movl %ebx, 8(%eax,%edx)
+; CHECK-NEXT:    movl %ecx, 4(%eax,%edx)
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    retl
+; CHECK-NEXT:    ## -- End function
 entry:
   %0 = bitcast i32* %y to <4 x i32>*
   %1 = load <4 x i32>, <4 x i32>* %0, align 16

Modified: llvm/trunk/test/CodeGen/X86/fp128-i128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fp128-i128.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fp128-i128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fp128-i128.ll Tue Jun 27 08:05:13 2017
@@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64
 ; CHECK-NEXT:    andq %rdi, %rcx
 ; CHECK-NEXT:    movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
 ; CHECK-NEXT:    andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    jmp foo # TAILCALL

Modified: llvm/trunk/test/CodeGen/X86/gather-addresses.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/gather-addresses.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/gather-addresses.ll (original)
+++ llvm/trunk/test/CodeGen/X86/gather-addresses.ll Tue Jun 27 08:05:13 2017
@@ -16,11 +16,10 @@
 ; LIN: sarq    $32, %r[[REG2]]
 ; LIN: movslq	%e[[REG4]], %r[[REG3:.+]]
 ; LIN: sarq    $32, %r[[REG4]]
-; LIN: movsd	(%rdi,%r[[REG1]],8), %xmm0
-; LIN: movhpd	(%rdi,%r[[REG2]],8), %xmm0
-; LIN: movsd	(%rdi,%r[[REG3]],8), %xmm1
-; LIN: movhpd	(%rdi,%r[[REG4]],8), %xmm1
-
+; LIN: movsd	(%rdi,%rsi,8), %xmm1
+; LIN: movhpd   (%rdi,%rax,8), %xmm1
+; LIN: movdqa   (%rsi), %xmm0 
+; LIN: movq     %rdi, %xmm1  
 ; WIN: movdqa	(%rdx), %xmm0
 ; WIN: pand 	(%r8), %xmm0
 ; WIN: pextrq	$1, %xmm0, %r[[REG4:.+]]
@@ -29,10 +28,10 @@
 ; WIN: sarq    $32, %r[[REG2]]
 ; WIN: movslq	%e[[REG4]], %r[[REG3:.+]]
 ; WIN: sarq    $32, %r[[REG4]]
-; WIN: movsd	(%rcx,%r[[REG1]],8), %xmm0
-; WIN: movhpd	(%rcx,%r[[REG2]],8), %xmm0
-; WIN: movsd	(%rcx,%r[[REG3]],8), %xmm1
-; WIN: movhpd	(%rcx,%r[[REG4]],8), %xmm1
+; WIN: movsd    (%rcx,%r9,8), %xmm1
+; WIN: movhpd   (%rcx,%rax,8), %xmm1
+; WIN: movdqa   (%rdx), %xmm0
+; WIN: movq     %rdx, %xmm1 
 
 define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
   %a = load <4 x i32>, <4 x i32>* %i

Modified: llvm/trunk/test/CodeGen/X86/half.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/half.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/half.ll (original)
+++ llvm/trunk/test/CodeGen/X86/half.ll Tue Jun 27 08:05:13 2017
@@ -1,266 +1,834 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=1 \
-; RUN:   | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWON
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=0 \
-; RUN:   | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWOFF
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c -asm-verbose=false -fixup-byte-word-insts=1 \
-; RUN:    | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C -check-prefix=BWON
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr +sse2 -asm-verbose=false -fixup-byte-word-insts=0  \
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -fixup-byte-word-insts=1 \
+; RUN:   | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWON,NOF16-BWINSTS
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -fixup-byte-word-insts=0 \
+; RUN:   | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWOFF,NOF16-NOBWINSTS
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c -fixup-byte-word-insts=1 \
+; RUN:    | FileCheck %s -check-prefixes=CHECK,BWON,CHECK-F16C
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr +sse2 -fixup-byte-word-insts=0  \
 ; RUN:    | FileCheck %s -check-prefix=CHECK-I686
 
-define void @test_load_store(half* %in, half* %out) {
-; CHECK-LABEL: test_load_store:
-; BWON:  movzwl (%rdi), %eax
-; BWOFF: movw (%rdi), %ax
-; CHECK: movw %ax, (%rsi)
+define void @test_load_store(half* %in, half* %out) #0 {
+; BWON-LABEL: test_load_store:
+; BWON:       # BB#0:
+; BWON-NEXT:    movzwl (%rdi), %eax
+; BWON-NEXT:    movw %ax, (%rsi)
+; BWON-NEXT:    retq
+;
+; BWOFF-LABEL: test_load_store:
+; BWOFF:       # BB#0:
+; BWOFF-NEXT:    movw (%rdi), %ax
+; BWOFF-NEXT:    movw %ax, (%rsi)
+; BWOFF-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_load_store:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT:    movw (%ecx), %cx
+; CHECK-I686-NEXT:    movw %cx, (%eax)
+; CHECK-I686-NEXT:    retl
   %val = load half, half* %in
   store half %val, half* %out
   ret void
 }
 
-define i16 @test_bitcast_from_half(half* %addr) {
-; CHECK-LABEL: test_bitcast_from_half:
-; BWON:  movzwl (%rdi), %eax
-; BWOFF: movw (%rdi), %ax
+define i16 @test_bitcast_from_half(half* %addr) #0 {
+; BWON-LABEL: test_bitcast_from_half:
+; BWON:       # BB#0:
+; BWON-NEXT:    movzwl (%rdi), %eax
+; BWON-NEXT:    retq
+;
+; BWOFF-LABEL: test_bitcast_from_half:
+; BWOFF:       # BB#0:
+; BWOFF-NEXT:    movw (%rdi), %ax
+; BWOFF-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_bitcast_from_half:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    movw (%eax), %ax
+; CHECK-I686-NEXT:    retl
   %val = load half, half* %addr
   %val_int = bitcast half %val to i16
   ret i16 %val_int
 }
 
-define void @test_bitcast_to_half(half* %addr, i16 %in) {
+define void @test_bitcast_to_half(half* %addr, i16 %in) #0 {
 ; CHECK-LABEL: test_bitcast_to_half:
-; CHECK: movw %si, (%rdi)
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw %si, (%rdi)
+; CHECK-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_bitcast_to_half:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    movw {{[0-9]+}}(%esp), %ax
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT:    movw %ax, (%ecx)
+; CHECK-I686-NEXT:    retl
   %val_fp = bitcast i16 %in to half
   store half %val_fp, half* %addr
   ret void
 }
 
-define float @test_extend32(half* %addr) {
-; CHECK-LABEL: test_extend32:
-
-; CHECK-LIBCALL: jmp __gnu_h2f_ieee
-; CHECK-F16C: vcvtph2ps
+define float @test_extend32(half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_extend32:
+; CHECK-LIBCALL:       # BB#0:
+; CHECK-LIBCALL-NEXT:    movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT:    jmp __gnu_h2f_ieee # TAILCALL
+;
+; CHECK-F16C-LABEL: test_extend32:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    movswl (%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm0
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_extend32:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    subl $12, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    movzwl (%eax), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    addl $12, %esp
+; CHECK-I686-NEXT:    retl
   %val16 = load half, half* %addr
   %val32 = fpext half %val16 to float
   ret float %val32
 }
 
-define double @test_extend64(half* %addr) {
-; CHECK-LABEL: test_extend64:
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: cvtss2sd
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtss2sd
+define double @test_extend64(half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_extend64:
+; CHECK-LIBCALL:       # BB#0:
+; CHECK-LIBCALL-NEXT:    pushq %rax
+; CHECK-LIBCALL-NEXT:    movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT:    popq %rax
+; CHECK-LIBCALL-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_extend64:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    movswl (%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm0
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_extend64:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    subl $12, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    movzwl (%eax), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    addl $12, %esp
+; CHECK-I686-NEXT:    retl
   %val16 = load half, half* %addr
   %val32 = fpext half %val16 to double
   ret double %val32
 }
 
-define void @test_trunc32(float %in, half* %addr) {
-; CHECK-LABEL: test_trunc32:
-
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-F16C: vcvtps2ph
+define void @test_trunc32(float %in, half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_trunc32:
+; CHECK-LIBCALL:       # BB#0:
+; CHECK-LIBCALL-NEXT:    pushq %rbx
+; CHECK-LIBCALL-NEXT:    movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT:    movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT:    popq %rbx
+; CHECK-LIBCALL-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_trunc32:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vmovd %xmm0, %eax
+; CHECK-F16C-NEXT:    movw %ax, (%rdi)
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_trunc32:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    pushl %esi
+; CHECK-I686-NEXT:    subl $8, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, (%esi)
+; CHECK-I686-NEXT:    addl $8, %esp
+; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    retl
   %val16 = fptrunc float %in to half
   store half %val16, half* %addr
   ret void
 }
 
-define void @test_trunc64(double %in, half* %addr) {
+define void @test_trunc64(double %in, half* %addr) #0 {
 ; CHECK-LABEL: test_trunc64:
-
-; CHECK-LIBCALL: callq __truncdfhf2
-; CHECK-F16C: callq __truncdfhf2
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    callq __truncdfhf2
+; CHECK-NEXT:    movw %ax, (%rbx)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_trunc64:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    pushl %esi
+; CHECK-I686-NEXT:    subl $8, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT:    movsd %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __truncdfhf2
+; CHECK-I686-NEXT:    movw %ax, (%esi)
+; CHECK-I686-NEXT:    addl $8, %esp
+; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    retl
   %val16 = fptrunc double %in to half
   store half %val16, half* %addr
   ret void
 }
 
 define i64 @test_fptosi_i64(half* %p) #0 {
-; CHECK-LABEL: test_fptosi_i64:
-
-; CHECK-LIBCALL-NEXT: pushq %rax
-; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax
-; CHECK-LIBCALL-NEXT: popq %rcx
-; CHECK-LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: movswl (%rdi), [[REG0:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtph2ps [[REG1]], [[REG2:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG2]], %rax
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_fptosi_i64:
+; CHECK-LIBCALL:       # BB#0:
+; CHECK-LIBCALL-NEXT:    pushq %rax
+; CHECK-LIBCALL-NEXT:    movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    cvttss2si %xmm0, %rax
+; CHECK-LIBCALL-NEXT:    popq %rcx
+; CHECK-LIBCALL-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_fptosi_i64:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    movswl (%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm0
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vcvttss2si %xmm0, %rax
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_fptosi_i64:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    subl $12, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    movzwl (%eax), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstps (%esp)
+; CHECK-I686-NEXT:    calll __fixsfdi
+; CHECK-I686-NEXT:    addl $12, %esp
+; CHECK-I686-NEXT:    retl
   %a = load half, half* %p, align 2
   %r = fptosi half %a to i64
   ret i64 %r
 }
 
 define void @test_sitofp_i64(i64 %a, half* %p) #0 {
-; CHECK-LABEL: test_sitofp_i64:
-
-; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z]+]]
-; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]]
-; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]])
-; CHECK_LIBCALL-NEXT: popq [[ADDR]]
-; CHECK_LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG0:%[a-z0-9]+]], [[REG0]]
-; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG0]], [[REG0]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], %eax
-; CHECK-F16C-NEXT: movw %ax, (%rsi)
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_sitofp_i64:
+; CHECK-LIBCALL:       # BB#0:
+; CHECK-LIBCALL-NEXT:    pushq %rbx
+; CHECK-LIBCALL-NEXT:    movq %rsi, %rbx
+; CHECK-LIBCALL-NEXT:    cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT:    movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT:    popq %rbx
+; CHECK-LIBCALL-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_sitofp_i64:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vmovd %xmm0, %eax
+; CHECK-F16C-NEXT:    movw %ax, (%rsi)
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_sitofp_i64:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    pushl %esi
+; CHECK-I686-NEXT:    subl $24, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fildll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, (%esi)
+; CHECK-I686-NEXT:    addl $24, %esp
+; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    retl
   %r = sitofp i64 %a to half
   store half %r, half* %p
   ret void
 }
 
 define i64 @test_fptoui_i64(half* %p) #0 {
-; CHECK-LABEL: test_fptoui_i64:
-
-; FP_TO_UINT is expanded using FP_TO_SINT
-; CHECK-LIBCALL-NEXT: pushq %rax
-; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss {{.[A-Z_0-9]+}}(%rip), [[REG1:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movaps %xmm0, [[REG2:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: subss [[REG1]], [[REG2]]
-; CHECK-LIBCALL-NEXT: cvttss2si [[REG2]], [[REG3:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movabsq  $-9223372036854775808, [[REG4:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: xorq [[REG3]], [[REG4]]
-; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, [[REG5:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: ucomiss [[REG1]], %xmm0
-; CHECK-LIBCALL-NEXT: cmovaeq [[REG4]], [[REG5]]
-; CHECK-LIBCALL-NEXT: popq %rcx
-; CHECK-LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: movswl (%rdi), [[REG0:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtph2ps [[REG1]], [[REG2:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovss {{.[A-Z_0-9]+}}(%rip), [[REG3:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vsubss [[REG3]], [[REG2]], [[REG4:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG4]], [[REG5:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: movabsq $-9223372036854775808, [[REG6:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: xorq [[REG5]], [[REG6:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG2]], [[REG7:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vucomiss [[REG3]], [[REG2]]
-; CHECK-F16C-NEXT: cmovaeq [[REG6]], %rax
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_fptoui_i64:
+; CHECK-LIBCALL:       # BB#0:
+; CHECK-LIBCALL-NEXT:    pushq %rax
+; CHECK-LIBCALL-NEXT:    movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT:    movaps %xmm0, %xmm2
+; CHECK-LIBCALL-NEXT:    subss %xmm1, %xmm2
+; CHECK-LIBCALL-NEXT:    cvttss2si %xmm2, %rcx
+; CHECK-LIBCALL-NEXT:    movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; CHECK-LIBCALL-NEXT:    cvttss2si %xmm0, %rax
+; CHECK-LIBCALL-NEXT:    xorq %rcx, %rdx
+; CHECK-LIBCALL-NEXT:    ucomiss %xmm1, %xmm0
+; CHECK-LIBCALL-NEXT:    cmovaeq %rdx, %rax
+; CHECK-LIBCALL-NEXT:    popq %rcx
+; CHECK-LIBCALL-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_fptoui_i64:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    movswl (%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm0
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-F16C-NEXT:    vsubss %xmm1, %xmm0, %xmm2
+; CHECK-F16C-NEXT:    vcvttss2si %xmm2, %rcx
+; CHECK-F16C-NEXT:    movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; CHECK-F16C-NEXT:    vcvttss2si %xmm0, %rax
+; CHECK-F16C-NEXT:    xorq %rcx, %rdx
+; CHECK-F16C-NEXT:    vucomiss %xmm1, %xmm0
+; CHECK-F16C-NEXT:    cmovaeq %rdx, %rax
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_fptoui_i64:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    subl $12, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    movzwl (%eax), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstps (%esp)
+; CHECK-I686-NEXT:    calll __fixunssfdi
+; CHECK-I686-NEXT:    addl $12, %esp
+; CHECK-I686-NEXT:    retl
   %a = load half, half* %p, align 2
   %r = fptoui half %a to i64
   ret i64 %r
 }
 
 define void @test_uitofp_i64(i64 %a, half* %p) #0 {
-; CHECK-LABEL: test_uitofp_i64:
-; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]]
-; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: js [[LABEL1:.LBB[0-9_]+]]
-
-; simple conversion to float if non-negative
-; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]], [[REG1]]
-; CHECK-NEXT: jmp [[LABEL2:.LBB[0-9_]+]]
-
-; convert using shift+or if negative
-; CHECK-NEXT: [[LABEL1]]:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: orq %rax, [[REG2:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: cvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: addss [[REG3]], [[REG1]]
-; CHECK-F16C-NEXT: vcvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]], [[REG3]]
-; CHECK-F16C-NEXT: vaddss [[REG3]], [[REG3]], [[REG1:[%a-z0-9]+]]
-
-; convert float to half
-; CHECK-NEXT: [[LABEL2]]:
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]])
-; CHECK-LIBCALL-NEXT: popq [[ADDR]]
-; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG1]], [[REG4:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG4]], %eax
-; CHECK-F16C-NEXT: movw %ax, (%rsi)
-; CHECK-NEXT: retq
-
+; CHECK-LIBCALL-LABEL: test_uitofp_i64:
+; CHECK-LIBCALL:       # BB#0:
+; CHECK-LIBCALL-NEXT:    pushq %rbx
+; CHECK-LIBCALL-NEXT:    movq %rsi, %rbx
+; CHECK-LIBCALL-NEXT:    testq %rdi, %rdi
+; CHECK-LIBCALL-NEXT:    js .LBB10_1
+; CHECK-LIBCALL-NEXT:  # BB#2:
+; CHECK-LIBCALL-NEXT:    cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT:    jmp .LBB10_3
+; CHECK-LIBCALL-NEXT:  .LBB10_1:
+; CHECK-LIBCALL-NEXT:    movq %rdi, %rax
+; CHECK-LIBCALL-NEXT:    shrq %rax
+; CHECK-LIBCALL-NEXT:    andl $1, %edi
+; CHECK-LIBCALL-NEXT:    orq %rax, %rdi
+; CHECK-LIBCALL-NEXT:    cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT:    addss %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT:  .LBB10_3:
+; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT:    movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT:    popq %rbx
+; CHECK-LIBCALL-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_uitofp_i64:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    testq %rdi, %rdi
+; CHECK-F16C-NEXT:    js .LBB10_1
+; CHECK-F16C-NEXT:  # BB#2:
+; CHECK-F16C-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0
+; CHECK-F16C-NEXT:    jmp .LBB10_3
+; CHECK-F16C-NEXT:  .LBB10_1:
+; CHECK-F16C-NEXT:    movq %rdi, %rax
+; CHECK-F16C-NEXT:    shrq %rax
+; CHECK-F16C-NEXT:    andl $1, %edi
+; CHECK-F16C-NEXT:    orq %rax, %rdi
+; CHECK-F16C-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; CHECK-F16C-NEXT:  .LBB10_3:
+; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vmovd %xmm0, %eax
+; CHECK-F16C-NEXT:    movw %ax, (%rsi)
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_uitofp_i64:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    pushl %esi
+; CHECK-I686-NEXT:    subl $24, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    xorl %eax, %eax
+; CHECK-I686-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    setns %al
+; CHECK-I686-NEXT:    fildll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
+; CHECK-I686-NEXT:    fstps (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, (%esi)
+; CHECK-I686-NEXT:    addl $24, %esp
+; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    retl
   %r = uitofp i64 %a to half
   store half %r, half* %p
   ret void
 }
 
 define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
-; CHECK-LABEL: test_extend32_vec4:
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
+; CHECK-LIBCALL-LABEL: test_extend32_vec4:
+; CHECK-LIBCALL:       # BB#0:
+; CHECK-LIBCALL-NEXT:    pushq %rbx
+; CHECK-LIBCALL-NEXT:    subq $48, %rsp
+; CHECK-LIBCALL-NEXT:    movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT:    movzwl 6(%rbx), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT:    movzwl 4(%rbx), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT:    movzwl (%rbx), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT:    movzwl 2(%rbx), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-LIBCALL-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-LIBCALL-NEXT:    insertps $32, {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT:    # xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; CHECK-LIBCALL-NEXT:    insertps $48, {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT:    # xmm1 = xmm1[0,1,2],mem[0]
+; CHECK-LIBCALL-NEXT:    movaps %xmm1, %xmm0
+; CHECK-LIBCALL-NEXT:    addq $48, %rsp
+; CHECK-LIBCALL-NEXT:    popq %rbx
+; CHECK-LIBCALL-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_extend32_vec4:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    movswl 6(%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm0
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT:    movswl 4(%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm1
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
+; CHECK-F16C-NEXT:    movswl (%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm2
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm2, %xmm2
+; CHECK-F16C-NEXT:    movswl 2(%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm3
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm3, %xmm3
+; CHECK-F16C-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; CHECK-F16C-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; CHECK-F16C-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_extend32_vec4:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    pushl %esi
+; CHECK-I686-NEXT:    subl $56, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    movzwl 2(%esi), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT:    movzwl 4(%esi), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT:    movzwl 6(%esi), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    movzwl (%esi), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-I686-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-I686-NEXT:    addl $56, %esp
+; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    retl
   %a = load <4 x half>, <4 x half>* %p, align 8
   %b = fpext <4 x half> %a to <4 x float>
   ret <4 x float> %b
 }
 
 define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
-; CHECK-LABEL: test_extend64_vec4
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL: cvtss2sd
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C: vcvtss2sd
+; CHECK-LIBCALL-LABEL: test_extend64_vec4:
+; CHECK-LIBCALL:       # BB#0:
+; CHECK-LIBCALL-NEXT:    pushq %rbx
+; CHECK-LIBCALL-NEXT:    subq $16, %rsp
+; CHECK-LIBCALL-NEXT:    movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT:    movzwl 4(%rbx), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT:    movzwl 6(%rbx), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT:    movzwl (%rbx), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT:    movzwl 2(%rbx), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm1
+; CHECK-LIBCALL-NEXT:    movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; CHECK-LIBCALL-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-LIBCALL-NEXT:    movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; CHECK-LIBCALL-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm1, %xmm2
+; CHECK-LIBCALL-NEXT:    movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; CHECK-LIBCALL-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm1, %xmm1
+; CHECK-LIBCALL-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; CHECK-LIBCALL-NEXT:    addq $16, %rsp
+; CHECK-LIBCALL-NEXT:    popq %rbx
+; CHECK-LIBCALL-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_extend64_vec4:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    movswl (%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm0
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT:    movswl 2(%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm1
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
+; CHECK-F16C-NEXT:    movswl 4(%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm2
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm2, %xmm2
+; CHECK-F16C-NEXT:    movswl 6(%rdi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm3
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm3, %xmm3
+; CHECK-F16C-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
+; CHECK-F16C-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
+; CHECK-F16C-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; CHECK-F16C-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
+; CHECK-F16C-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-F16C-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_extend64_vec4:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    pushl %esi
+; CHECK-I686-NEXT:    subl $88, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    movzwl 6(%esi), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT:    movzwl 4(%esi), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT:    movzwl 2(%esi), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT:    movzwl (%esi), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-I686-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-I686-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; CHECK-I686-NEXT:    addl $88, %esp
+; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    retl
   %a = load <4 x half>, <4 x half>* %p, align 8
   %b = fpext <4 x half> %a to <4 x double>
   ret <4 x double> %b
 }
 
-define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) {
-; CHECK-LABEL: test_trunc32_vec4:
-
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
+define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
+; NOF16-BWINSTS-LABEL: test_trunc32_vec4:
+; NOF16-BWINSTS:       # BB#0:
+; NOF16-BWINSTS-NEXT:    pushq %rbp
+; NOF16-BWINSTS-NEXT:    pushq %r15
+; NOF16-BWINSTS-NEXT:    pushq %r14
+; NOF16-BWINSTS-NEXT:    pushq %rbx
+; NOF16-BWINSTS-NEXT:    subq $24, %rsp
+; NOF16-BWINSTS-NEXT:    movq %rdi, %rbx
+; NOF16-BWINSTS-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; NOF16-BWINSTS-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; NOF16-BWINSTS-NEXT:    callq __gnu_f2h_ieee
+; NOF16-BWINSTS-NEXT:    movl %eax, %r14d
+; NOF16-BWINSTS-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; NOF16-BWINSTS-NEXT:    callq __gnu_f2h_ieee
+; NOF16-BWINSTS-NEXT:    movl %eax, %r15d
+; NOF16-BWINSTS-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; NOF16-BWINSTS-NEXT:    callq __gnu_f2h_ieee
+; NOF16-BWINSTS-NEXT:    movl %eax, %ebp
+; NOF16-BWINSTS-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT:    callq __gnu_f2h_ieee
+; NOF16-BWINSTS-NEXT:    movw %ax, (%rbx)
+; NOF16-BWINSTS-NEXT:    movw %bp, 6(%rbx)
+; NOF16-BWINSTS-NEXT:    movw %r15w, 4(%rbx)
+; NOF16-BWINSTS-NEXT:    movw %r14w, 2(%rbx)
+; NOF16-BWINSTS-NEXT:    addq $24, %rsp
+; NOF16-BWINSTS-NEXT:    popq %rbx
+; NOF16-BWINSTS-NEXT:    popq %r14
+; NOF16-BWINSTS-NEXT:    popq %r15
+; NOF16-BWINSTS-NEXT:    popq %rbp
+; NOF16-BWINSTS-NEXT:    retq
+;
+; BWOFF-LABEL: test_trunc32_vec4:
+; BWOFF:       # BB#0:
+; BWOFF-NEXT:    pushq %rbp
+; BWOFF-NEXT:    pushq %r15
+; BWOFF-NEXT:    pushq %r14
+; BWOFF-NEXT:    pushq %rbx
+; BWOFF-NEXT:    subq $24, %rsp
+; BWOFF-NEXT:    movq %rdi, %rbx
+; BWOFF-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; BWOFF-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; BWOFF-NEXT:    callq __gnu_f2h_ieee
+; BWOFF-NEXT:    movw %ax, %r14w
+; BWOFF-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT:    callq __gnu_f2h_ieee
+; BWOFF-NEXT:    movw %ax, %r15w
+; BWOFF-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; BWOFF-NEXT:    callq __gnu_f2h_ieee
+; BWOFF-NEXT:    movw %ax, %bp
+; BWOFF-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    callq __gnu_f2h_ieee
+; BWOFF-NEXT:    movw %ax, (%rbx)
+; BWOFF-NEXT:    movw %bp, 6(%rbx)
+; BWOFF-NEXT:    movw %r15w, 4(%rbx)
+; BWOFF-NEXT:    movw %r14w, 2(%rbx)
+; BWOFF-NEXT:    addq $24, %rsp
+; BWOFF-NEXT:    popq %rbx
+; BWOFF-NEXT:    popq %r14
+; BWOFF-NEXT:    popq %r15
+; BWOFF-NEXT:    popq %rbp
+; BWOFF-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_trunc32_vec4:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-F16C-NEXT:    vmovd %xmm1, %eax
+; CHECK-F16C-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-F16C-NEXT:    vmovd %xmm1, %ecx
+; CHECK-F16C-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-F16C-NEXT:    vmovd %xmm1, %edx
+; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vmovd %xmm0, %esi
+; CHECK-F16C-NEXT:    movw %si, (%rdi)
+; CHECK-F16C-NEXT:    movw %dx, 6(%rdi)
+; CHECK-F16C-NEXT:    movw %cx, 4(%rdi)
+; CHECK-F16C-NEXT:    movw %ax, 2(%rdi)
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_trunc32_vec4:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    pushl %ebp
+; CHECK-I686-NEXT:    pushl %ebx
+; CHECK-I686-NEXT:    pushl %edi
+; CHECK-I686-NEXT:    pushl %esi
+; CHECK-I686-NEXT:    subl $44, %esp
+; CHECK-I686-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-I686-NEXT:    movaps %xmm0, %xmm1
+; CHECK-I686-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-I686-NEXT:    movss %xmm1, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, %si
+; CHECK-I686-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, %di
+; CHECK-I686-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, %bx
+; CHECK-I686-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movw %ax, (%ebp)
+; CHECK-I686-NEXT:    movw %bx, 6(%ebp)
+; CHECK-I686-NEXT:    movw %di, 4(%ebp)
+; CHECK-I686-NEXT:    movw %si, 2(%ebp)
+; CHECK-I686-NEXT:    addl $44, %esp
+; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    popl %edi
+; CHECK-I686-NEXT:    popl %ebx
+; CHECK-I686-NEXT:    popl %ebp
+; CHECK-I686-NEXT:    retl
   %v = fptrunc <4 x float> %a to <4 x half>
   store <4 x half> %v, <4 x half>* %p
   ret void
 }
 
-define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) {
-; CHECK-LABEL: test_trunc64_vec4:
-; CHECK: callq  __truncdfhf2
-; CHECK: callq  __truncdfhf2
-; CHECK: callq  __truncdfhf2
-; CHECK: callq  __truncdfhf2
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
+define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 {
+; NOF16-BWINSTS-LABEL: test_trunc64_vec4:
+; NOF16-BWINSTS:       # BB#0:
+; NOF16-BWINSTS-NEXT:    pushq %rbp
+; NOF16-BWINSTS-NEXT:    pushq %r15
+; NOF16-BWINSTS-NEXT:    pushq %r14
+; NOF16-BWINSTS-NEXT:    pushq %rbx
+; NOF16-BWINSTS-NEXT:    subq $40, %rsp
+; NOF16-BWINSTS-NEXT:    movq %rdi, %rbx
+; NOF16-BWINSTS-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; NOF16-BWINSTS-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; NOF16-BWINSTS-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; NOF16-BWINSTS-NEXT:    callq __truncdfhf2
+; NOF16-BWINSTS-NEXT:    movl %eax, %r14d
+; NOF16-BWINSTS-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; NOF16-BWINSTS-NEXT:    callq __truncdfhf2
+; NOF16-BWINSTS-NEXT:    movl %eax, %r15d
+; NOF16-BWINSTS-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT:    callq __truncdfhf2
+; NOF16-BWINSTS-NEXT:    movl %eax, %ebp
+; NOF16-BWINSTS-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT:    callq __truncdfhf2
+; NOF16-BWINSTS-NEXT:    movw %ax, 4(%rbx)
+; NOF16-BWINSTS-NEXT:    movw %bp, (%rbx)
+; NOF16-BWINSTS-NEXT:    movw %r15w, 6(%rbx)
+; NOF16-BWINSTS-NEXT:    movw %r14w, 2(%rbx)
+; NOF16-BWINSTS-NEXT:    addq $40, %rsp
+; NOF16-BWINSTS-NEXT:    popq %rbx
+; NOF16-BWINSTS-NEXT:    popq %r14
+; NOF16-BWINSTS-NEXT:    popq %r15
+; NOF16-BWINSTS-NEXT:    popq %rbp
+; NOF16-BWINSTS-NEXT:    retq
+;
+; BWOFF-LABEL: test_trunc64_vec4:
+; BWOFF:       # BB#0:
+; BWOFF-NEXT:    pushq %rbp
+; BWOFF-NEXT:    pushq %r15
+; BWOFF-NEXT:    pushq %r14
+; BWOFF-NEXT:    pushq %rbx
+; BWOFF-NEXT:    subq $40, %rsp
+; BWOFF-NEXT:    movq %rdi, %rbx
+; BWOFF-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; BWOFF-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; BWOFF-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT:    callq __truncdfhf2
+; BWOFF-NEXT:    movw %ax, %r14w
+; BWOFF-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT:    callq __truncdfhf2
+; BWOFF-NEXT:    movw %ax, %r15w
+; BWOFF-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    callq __truncdfhf2
+; BWOFF-NEXT:    movw %ax, %bp
+; BWOFF-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    callq __truncdfhf2
+; BWOFF-NEXT:    movw %ax, 4(%rbx)
+; BWOFF-NEXT:    movw %bp, (%rbx)
+; BWOFF-NEXT:    movw %r15w, 6(%rbx)
+; BWOFF-NEXT:    movw %r14w, 2(%rbx)
+; BWOFF-NEXT:    addq $40, %rsp
+; BWOFF-NEXT:    popq %rbx
+; BWOFF-NEXT:    popq %r14
+; BWOFF-NEXT:    popq %r15
+; BWOFF-NEXT:    popq %rbp
+; BWOFF-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_trunc64_vec4:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    pushq %rbp
+; CHECK-F16C-NEXT:    pushq %r15
+; CHECK-F16C-NEXT:    pushq %r14
+; CHECK-F16C-NEXT:    pushq %rbx
+; CHECK-F16C-NEXT:    subq $88, %rsp
+; CHECK-F16C-NEXT:    movq %rdi, %rbx
+; CHECK-F16C-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-F16C-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-F16C-NEXT:    vzeroupper
+; CHECK-F16C-NEXT:    callq __truncdfhf2
+; CHECK-F16C-NEXT:    movl %eax, %r14d
+; CHECK-F16C-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-F16C-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-F16C-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-F16C-NEXT:    vzeroupper
+; CHECK-F16C-NEXT:    callq __truncdfhf2
+; CHECK-F16C-NEXT:    movl %eax, %r15d
+; CHECK-F16C-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-F16C-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; CHECK-F16C-NEXT:    vzeroupper
+; CHECK-F16C-NEXT:    callq __truncdfhf2
+; CHECK-F16C-NEXT:    movl %eax, %ebp
+; CHECK-F16C-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-F16C-NEXT:    callq __truncdfhf2
+; CHECK-F16C-NEXT:    movw %ax, 4(%rbx)
+; CHECK-F16C-NEXT:    movw %bp, (%rbx)
+; CHECK-F16C-NEXT:    movw %r15w, 6(%rbx)
+; CHECK-F16C-NEXT:    movw %r14w, 2(%rbx)
+; CHECK-F16C-NEXT:    addq $88, %rsp
+; CHECK-F16C-NEXT:    popq %rbx
+; CHECK-F16C-NEXT:    popq %r14
+; CHECK-F16C-NEXT:    popq %r15
+; CHECK-F16C-NEXT:    popq %rbp
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_trunc64_vec4:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    pushl %ebp
+; CHECK-I686-NEXT:    pushl %ebx
+; CHECK-I686-NEXT:    pushl %edi
+; CHECK-I686-NEXT:    pushl %esi
+; CHECK-I686-NEXT:    subl $60, %esp
+; CHECK-I686-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-I686-NEXT:    movlps %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __truncdfhf2
+; CHECK-I686-NEXT:    movw %ax, %si
+; CHECK-I686-NEXT:    movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movhpd %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __truncdfhf2
+; CHECK-I686-NEXT:    movw %ax, %di
+; CHECK-I686-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movlps %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __truncdfhf2
+; CHECK-I686-NEXT:    movw %ax, %bx
+; CHECK-I686-NEXT:    movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movhpd %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __truncdfhf2
+; CHECK-I686-NEXT:    movw %ax, 6(%ebp)
+; CHECK-I686-NEXT:    movw %bx, 4(%ebp)
+; CHECK-I686-NEXT:    movw %di, 2(%ebp)
+; CHECK-I686-NEXT:    movw %si, (%ebp)
+; CHECK-I686-NEXT:    addl $60, %esp
+; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    popl %edi
+; CHECK-I686-NEXT:    popl %ebx
+; CHECK-I686-NEXT:    popl %ebp
+; CHECK-I686-NEXT:    retl
   %v = fptrunc <4 x double> %a to <4 x half>
   store <4 x half> %v, <4 x half>* %p
   ret void
@@ -268,44 +836,99 @@ define void @test_trunc64_vec4(<4 x doub
 
 declare float @test_floatret();
 
-; On i686, if SSE2 is available, the return value from test_floatret is loaded
-; to f80 and then rounded to f32.  The DAG combiner should not combine this
-; fp_round and the subsequent fptrunc from float to half.
 define half @test_f80trunc_nodagcombine() #0 {
-; CHECK-LABEL: test_f80trunc_nodagcombine:
-; CHECK-I686-NOT: calll __truncxfhf2
+; CHECK-LIBCALL-LABEL: test_f80trunc_nodagcombine:
+; CHECK-LIBCALL:       # BB#0:
+; CHECK-LIBCALL-NEXT:    pushq %rax
+; CHECK-LIBCALL-NEXT:    callq test_floatret
+; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT:    movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    popq %rax
+; CHECK-LIBCALL-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_f80trunc_nodagcombine:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    pushq %rax
+; CHECK-F16C-NEXT:    callq test_floatret
+; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT:    popq %rax
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_f80trunc_nodagcombine:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    subl $12, %esp
+; CHECK-I686-NEXT:    calll test_floatret
+; CHECK-I686-NEXT:    fstps (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movzwl %ax, %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    addl $12, %esp
+; CHECK-I686-NEXT:    retl
   %1 = call float @test_floatret()
   %2 = fptrunc float %1 to half
   ret half %2
 }
 
-; CHECK-LABEL: test_sitofp_fadd_i32:
 
-; CHECK-LIBCALL-NEXT: pushq %rbx
-; CHECK-LIBCALL-NEXT: subq $16, %rsp
-; CHECK-LIBCALL-NEXT: movl %edi, %ebx
-; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss %xmm0, 12(%rsp)
-; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: addss 12(%rsp), %xmm0
-; CHECK-LIBCALL-NEXT: addq $16, %rsp
-; CHECK-LIBCALL-NEXT: popq %rbx
-; CHECK-LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: movswl (%rsi), %eax
-; CHECK-F16C-NEXT: vmovd %eax, %xmm0
-; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1
-; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-F16C-NEXT: retq
+
 
 define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
+; CHECK-LIBCALL-LABEL: test_sitofp_fadd_i32:
+; CHECK-LIBCALL:       # BB#0:
+; CHECK-LIBCALL-NEXT:    pushq %rbx
+; CHECK-LIBCALL-NEXT:    subq $16, %rsp
+; CHECK-LIBCALL-NEXT:    movl %edi, %ebx
+; CHECK-LIBCALL-NEXT:    movzwl (%rsi), %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT:    cvtsi2ssl %ebx, %xmm0
+; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT:    movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT:    addss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; CHECK-LIBCALL-NEXT:    addq $16, %rsp
+; CHECK-LIBCALL-NEXT:    popq %rbx
+; CHECK-LIBCALL-NEXT:    retq
+;
+; CHECK-F16C-LABEL: test_sitofp_fadd_i32:
+; CHECK-F16C:       # BB#0:
+; CHECK-F16C-NEXT:    movswl (%rsi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm0
+; CHECK-F16C-NEXT:    vcvtsi2ssl %edi, %xmm1, %xmm1
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
+; CHECK-F16C-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: test_sitofp_fadd_i32:
+; CHECK-I686:       # BB#0:
+; CHECK-I686-NEXT:    subl $28, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    movzwl (%eax), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    movss %xmm0, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-I686-NEXT:    xorps %xmm0, %xmm0
+; CHECK-I686-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
+; CHECK-I686-NEXT:    movzwl %ax, %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # 4-byte Reload
+; CHECK-I686-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    addl $28, %esp
+; CHECK-I686-NEXT:    retl
   %tmp0 = load half, half* %b
   %tmp1 = sitofp i32 %a to half
   %tmp2 = fadd half %tmp0, %tmp1

Modified: llvm/trunk/test/CodeGen/X86/illegal-bitfield-loadstore.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/illegal-bitfield-loadstore.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/illegal-bitfield-loadstore.ll (original)
+++ llvm/trunk/test/CodeGen/X86/illegal-bitfield-loadstore.ll Tue Jun 27 08:05:13 2017
@@ -112,23 +112,23 @@ define void @i56_and_or(i56* %a) {
 define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
 ; CHECK-LABEL: i56_insert_bit:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    movzwl 4(%rdi), %ecx
-; CHECK-NEXT:    movzbl 6(%rdi), %edx
-; CHECK-NEXT:    movl (%rdi), %esi
-; CHECK-NEXT:    movb %dl, 6(%rdi)
-; CHECK-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill> %RDX<def>
-; CHECK-NEXT:    shll $16, %edx
-; CHECK-NEXT:    orl %ecx, %edx
-; CHECK-NEXT:    shlq $32, %rdx
-; CHECK-NEXT:    orq %rdx, %rsi
-; CHECK-NEXT:    shlq $13, %rax
-; CHECK-NEXT:    movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF
-; CHECK-NEXT:    andq %rsi, %rcx
-; CHECK-NEXT:    orq %rax, %rcx
-; CHECK-NEXT:    movl %ecx, (%rdi)
-; CHECK-NEXT:    shrq $32, %rcx
-; CHECK-NEXT:    movw %cx, 4(%rdi)
+; CHECK-NEXT:    movzwl 4(%rdi), %eax
+; CHECK-NEXT:    movzbl 6(%rdi), %ecx
+; CHECK-NEXT:    movl (%rdi), %edx
+; CHECK-NEXT:    movb %cl, 6(%rdi)
+; CHECK-NEXT:    movzbl %sil, %esi
+; CHECK-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
+; CHECK-NEXT:    shll $16, %ecx
+; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    shlq $32, %rcx
+; CHECK-NEXT:    orq %rcx, %rdx
+; CHECK-NEXT:    shlq $13, %rsi
+; CHECK-NEXT:    movabsq $72057594037919743, %rax # imm = 0xFFFFFFFFFFDFFF
+; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    orq %rsi, %rax
+; CHECK-NEXT:    movl %eax, (%rdi)
+; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    movw %ax, 4(%rdi)
 ; CHECK-NEXT:    retq
   %extbit = zext i1 %bit to i56
   %b = load i56, i56* %a, align 1

Modified: llvm/trunk/test/CodeGen/X86/mul-constant-i32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mul-constant-i32.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mul-constant-i32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mul-constant-i32.ll Tue Jun 27 08:05:13 2017
@@ -17,7 +17,7 @@ define i32 @test_mul_by_1(i32 %x) {
 ; X64-HSW-LABEL: test_mul_by_1:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_1:
 ; X64-JAG:       # BB#0:
@@ -32,7 +32,7 @@ define i32 @test_mul_by_1(i32 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_1:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_1:
 ; JAG-NOOPT:       # BB#0:
@@ -63,7 +63,7 @@ define i32 @test_mul_by_2(i32 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_2:
 ; X64-JAG:       # BB#0:
@@ -81,7 +81,7 @@ define i32 @test_mul_by_2(i32 %x) {
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_2:
 ; JAG-NOOPT:       # BB#0:
@@ -114,7 +114,7 @@ define i32 @test_mul_by_3(i32 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_3:
 ; X64-JAG:       # BB#0:
@@ -131,7 +131,7 @@ define i32 @test_mul_by_3(i32 %x) {
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_3:
 ; JAG-NOOPT:       # BB#0:
@@ -165,7 +165,7 @@ define i32 @test_mul_by_4(i32 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_4:
 ; X64-JAG:       # BB#0:
@@ -183,7 +183,7 @@ define i32 @test_mul_by_4(i32 %x) {
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_4:
 ; JAG-NOOPT:       # BB#0:
@@ -216,7 +216,7 @@ define i32 @test_mul_by_5(i32 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_5:
 ; X64-JAG:       # BB#0:
@@ -233,7 +233,7 @@ define i32 @test_mul_by_5(i32 %x) {
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_5:
 ; JAG-NOOPT:       # BB#0:
@@ -269,7 +269,7 @@ define i32 @test_mul_by_6(i32 %x) {
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    addl %edi, %edi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_6:
 ; X64-JAG:       # BB#0:
@@ -285,8 +285,8 @@ define i32 @test_mul_by_6(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_6:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $6, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $6, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_6:
 ; JAG-NOOPT:       # BB#0:
@@ -321,7 +321,7 @@ define i32 @test_mul_by_7(i32 %x) {
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_7:
 ; X64-JAG:       # BB#0:
@@ -337,8 +337,8 @@ define i32 @test_mul_by_7(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_7:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $7, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $7, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_7:
 ; JAG-NOOPT:       # BB#0:
@@ -371,7 +371,7 @@ define i32 @test_mul_by_8(i32 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_8:
 ; X64-JAG:       # BB#0:
@@ -389,7 +389,7 @@ define i32 @test_mul_by_8(i32 %x) {
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_8:
 ; JAG-NOOPT:       # BB#0:
@@ -422,7 +422,7 @@ define i32 @test_mul_by_9(i32 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_9:
 ; X64-JAG:       # BB#0:
@@ -439,7 +439,7 @@ define i32 @test_mul_by_9(i32 %x) {
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_9:
 ; JAG-NOOPT:       # BB#0:
@@ -475,7 +475,7 @@ define i32 @test_mul_by_10(i32 %x) {
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    addl %edi, %edi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_10:
 ; X64-JAG:       # BB#0:
@@ -491,8 +491,8 @@ define i32 @test_mul_by_10(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_10:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $10, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $10, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_10:
 ; JAG-NOOPT:       # BB#0:
@@ -527,7 +527,7 @@ define i32 @test_mul_by_11(i32 %x) {
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_11:
 ; X64-JAG:       # BB#0:
@@ -543,8 +543,8 @@ define i32 @test_mul_by_11(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_11:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $11, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $11, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_11:
 ; JAG-NOOPT:       # BB#0:
@@ -575,9 +575,9 @@ define i32 @test_mul_by_12(i32 %x) {
 ; X64-HSW-LABEL: test_mul_by_12:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT:    shll $2, %edi # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $2, %edi # sched: [1:1.00]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_12:
 ; X64-JAG:       # BB#0:
@@ -593,8 +593,8 @@ define i32 @test_mul_by_12(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_12:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $12, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $12, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_12:
 ; JAG-NOOPT:       # BB#0:
@@ -629,7 +629,7 @@ define i32 @test_mul_by_13(i32 %x) {
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_13:
 ; X64-JAG:       # BB#0:
@@ -645,8 +645,8 @@ define i32 @test_mul_by_13(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_13:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $13, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $13, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_13:
 ; JAG-NOOPT:       # BB#0:
@@ -681,7 +681,7 @@ define i32 @test_mul_by_14(i32 %x) {
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_14:
 ; X64-JAG:       # BB#0:
@@ -698,8 +698,8 @@ define i32 @test_mul_by_14(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_14:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $14, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $14, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_14:
 ; JAG-NOOPT:       # BB#0:
@@ -732,7 +732,7 @@ define i32 @test_mul_by_15(i32 %x) {
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_15:
 ; X64-JAG:       # BB#0:
@@ -748,8 +748,8 @@ define i32 @test_mul_by_15(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_15:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $15, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $15, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_15:
 ; JAG-NOOPT:       # BB#0:
@@ -780,9 +780,9 @@ define i32 @test_mul_by_16(i32 %x) {
 ;
 ; X64-HSW-LABEL: test_mul_by_16:
 ; X64-HSW:       # BB#0:
-; X64-HSW-NEXT:    shll $4, %edi # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $4, %edi # sched: [1:1.00]
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_16:
 ; X64-JAG:       # BB#0:
@@ -798,9 +798,9 @@ define i32 @test_mul_by_16(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_16:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    shll $4, %edi # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    shll $4, %edi # sched: [1:1.00]
 ; HSW-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_16:
 ; JAG-NOOPT:       # BB#0:
@@ -836,9 +836,9 @@ define i32 @test_mul_by_17(i32 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    shll $4, %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $4, %eax # sched: [1:1.00]
 ; X64-HSW-NEXT:    leal (%rax,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_17:
 ; X64-JAG:       # BB#0:
@@ -855,8 +855,8 @@ define i32 @test_mul_by_17(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_17:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $17, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $17, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_17:
 ; JAG-NOOPT:       # BB#0:
@@ -892,7 +892,7 @@ define i32 @test_mul_by_18(i32 %x) {
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    addl %edi, %edi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_18:
 ; X64-JAG:       # BB#0:
@@ -908,8 +908,8 @@ define i32 @test_mul_by_18(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_18:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $18, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $18, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_18:
 ; JAG-NOOPT:       # BB#0:
@@ -944,9 +944,9 @@ define i32 @test_mul_by_19(i32 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    shll $2, %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $2, %eax # sched: [1:1.00]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_19:
 ; X64-JAG:       # BB#0:
@@ -963,8 +963,8 @@ define i32 @test_mul_by_19(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_19:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $19, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $19, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_19:
 ; JAG-NOOPT:       # BB#0:
@@ -995,9 +995,9 @@ define i32 @test_mul_by_20(i32 %x) {
 ; X64-HSW-LABEL: test_mul_by_20:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT:    shll $2, %edi # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $2, %edi # sched: [1:1.00]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_20:
 ; X64-JAG:       # BB#0:
@@ -1013,8 +1013,8 @@ define i32 @test_mul_by_20(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_20:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $20, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $20, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_20:
 ; JAG-NOOPT:       # BB#0:
@@ -1049,7 +1049,7 @@ define i32 @test_mul_by_21(i32 %x) {
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_21:
 ; X64-JAG:       # BB#0:
@@ -1065,8 +1065,8 @@ define i32 @test_mul_by_21(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_21:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $21, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $21, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_21:
 ; JAG-NOOPT:       # BB#0:
@@ -1101,7 +1101,7 @@ define i32 @test_mul_by_22(i32 %x) {
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_22:
 ; X64-JAG:       # BB#0:
@@ -1118,8 +1118,8 @@ define i32 @test_mul_by_22(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_22:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $22, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $22, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_22:
 ; JAG-NOOPT:       # BB#0:
@@ -1152,9 +1152,9 @@ define i32 @test_mul_by_23(i32 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    shll $3, %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $3, %eax # sched: [1:1.00]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_23:
 ; X64-JAG:       # BB#0:
@@ -1171,8 +1171,8 @@ define i32 @test_mul_by_23(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_23:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $23, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $23, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_23:
 ; JAG-NOOPT:       # BB#0:
@@ -1203,9 +1203,9 @@ define i32 @test_mul_by_24(i32 %x) {
 ; X64-HSW-LABEL: test_mul_by_24:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT:    shll $3, %edi # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $3, %edi # sched: [1:1.00]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_24:
 ; X64-JAG:       # BB#0:
@@ -1221,8 +1221,8 @@ define i32 @test_mul_by_24(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_24:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $24, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $24, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_24:
 ; JAG-NOOPT:       # BB#0:
@@ -1257,7 +1257,7 @@ define i32 @test_mul_by_25(i32 %x) {
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_25:
 ; X64-JAG:       # BB#0:
@@ -1273,8 +1273,8 @@ define i32 @test_mul_by_25(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_25:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $25, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $25, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_25:
 ; JAG-NOOPT:       # BB#0:
@@ -1311,7 +1311,7 @@ define i32 @test_mul_by_26(i32 %x) {
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_26:
 ; X64-JAG:       # BB#0:
@@ -1328,8 +1328,8 @@ define i32 @test_mul_by_26(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_26:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $26, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $26, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_26:
 ; JAG-NOOPT:       # BB#0:
@@ -1362,7 +1362,7 @@ define i32 @test_mul_by_27(i32 %x) {
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_27:
 ; X64-JAG:       # BB#0:
@@ -1378,8 +1378,8 @@ define i32 @test_mul_by_27(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_27:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $27, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $27, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_27:
 ; JAG-NOOPT:       # BB#0:
@@ -1416,7 +1416,7 @@ define i32 @test_mul_by_28(i32 %x) {
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_28:
 ; X64-JAG:       # BB#0:
@@ -1433,8 +1433,8 @@ define i32 @test_mul_by_28(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_28:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $28, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $28, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_28:
 ; JAG-NOOPT:       # BB#0:
@@ -1471,7 +1471,7 @@ define i32 @test_mul_by_29(i32 %x) {
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
 ; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
 ; X64-JAG:       # BB#0:
@@ -1489,8 +1489,8 @@ define i32 @test_mul_by_29(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_29:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $29, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $29, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_29:
 ; JAG-NOOPT:       # BB#0:
@@ -1523,10 +1523,10 @@ define i32 @test_mul_by_30(i32 %x) {
 ; X64-HSW-LABEL: test_mul_by_30:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    shll $5, %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $5, %eax # sched: [1:1.00]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_30:
 ; X64-JAG:       # BB#0:
@@ -1543,8 +1543,8 @@ define i32 @test_mul_by_30(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_30:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $30, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $30, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_30:
 ; JAG-NOOPT:       # BB#0:
@@ -1576,9 +1576,9 @@ define i32 @test_mul_by_31(i32 %x) {
 ; X64-HSW-LABEL: test_mul_by_31:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    shll $5, %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $5, %eax # sched: [1:1.00]
 ; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_31:
 ; X64-JAG:       # BB#0:
@@ -1594,8 +1594,8 @@ define i32 @test_mul_by_31(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_31:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    imull $31, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull $31, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_31:
 ; JAG-NOOPT:       # BB#0:
@@ -1626,9 +1626,9 @@ define i32 @test_mul_by_32(i32 %x) {
 ;
 ; X64-HSW-LABEL: test_mul_by_32:
 ; X64-HSW:       # BB#0:
-; X64-HSW-NEXT:    shll $5, %edi # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $5, %edi # sched: [1:1.00]
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_32:
 ; X64-JAG:       # BB#0:
@@ -1644,9 +1644,9 @@ define i32 @test_mul_by_32(i32 %x) {
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_32:
 ; HSW-NOOPT:       # BB#0:
-; HSW-NOOPT-NEXT:    shll $5, %edi # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    shll $5, %edi # sched: [1:1.00]
 ; HSW-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_32:
 ; JAG-NOOPT:       # BB#0:
@@ -1686,8 +1686,8 @@ define i32 @test_mul_spec(i32 %x) nounwi
 ; X64-HSW-NEXT:    addl $42, %ecx # sched: [1:0.25]
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addl $2, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    imull %ecx, %eax # sched: [4:1.00]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    imull %ecx, %eax # sched: [3:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_spec:
 ; X64-JAG:       # BB#0:
@@ -1712,8 +1712,8 @@ define i32 @test_mul_spec(i32 %x) nounwi
 ; HSW-NOOPT-NEXT:    addl $42, %ecx # sched: [1:0.25]
 ; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    addl $2, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    imull %ecx, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    imull %ecx, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_spec:
 ; JAG-NOOPT:       # BB#0:

Modified: llvm/trunk/test/CodeGen/X86/mul-constant-i64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mul-constant-i64.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mul-constant-i64.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mul-constant-i64.ll Tue Jun 27 08:05:13 2017
@@ -18,7 +18,7 @@ define i64 @test_mul_by_1(i64 %x) nounwi
 ; X64-HSW-LABEL: test_mul_by_1:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_1:
 ; X64-JAG:       # BB#0:
@@ -34,7 +34,7 @@ define i64 @test_mul_by_1(i64 %x) nounwi
 ; HSW-NOOPT-LABEL: test_mul_by_1:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_1:
 ; JAG-NOOPT:       # BB#0:
@@ -66,7 +66,7 @@ define i64 @test_mul_by_2(i64 %x) {
 ; X64-HSW-LABEL: test_mul_by_2:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_2:
 ; X64-JAG:       # BB#0:
@@ -84,7 +84,7 @@ define i64 @test_mul_by_2(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_2:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_2:
 ; JAG-NOOPT:       # BB#0:
@@ -116,7 +116,7 @@ define i64 @test_mul_by_3(i64 %x) {
 ; X64-HSW-LABEL: test_mul_by_3:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_3:
 ; X64-JAG:       # BB#0:
@@ -134,7 +134,7 @@ define i64 @test_mul_by_3(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_3:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_3:
 ; JAG-NOOPT:       # BB#0:
@@ -166,7 +166,7 @@ define i64 @test_mul_by_4(i64 %x) {
 ; X64-HSW-LABEL: test_mul_by_4:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_4:
 ; X64-JAG:       # BB#0:
@@ -184,7 +184,7 @@ define i64 @test_mul_by_4(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_4:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_4:
 ; JAG-NOOPT:       # BB#0:
@@ -216,7 +216,7 @@ define i64 @test_mul_by_5(i64 %x) {
 ; X64-HSW-LABEL: test_mul_by_5:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_5:
 ; X64-JAG:       # BB#0:
@@ -234,7 +234,7 @@ define i64 @test_mul_by_5(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_5:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_5:
 ; JAG-NOOPT:       # BB#0:
@@ -268,7 +268,7 @@ define i64 @test_mul_by_6(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    addq %rdi, %rdi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_6:
 ; X64-JAG:       # BB#0:
@@ -287,7 +287,7 @@ define i64 @test_mul_by_6(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_6:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $6, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_6:
 ; JAG-NOOPT:       # BB#0:
@@ -323,7 +323,7 @@ define i64 @test_mul_by_7(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_7:
 ; X64-JAG:       # BB#0:
@@ -342,7 +342,7 @@ define i64 @test_mul_by_7(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_7:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $7, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_7:
 ; JAG-NOOPT:       # BB#0:
@@ -375,7 +375,7 @@ define i64 @test_mul_by_8(i64 %x) {
 ; X64-HSW-LABEL: test_mul_by_8:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_8:
 ; X64-JAG:       # BB#0:
@@ -393,7 +393,7 @@ define i64 @test_mul_by_8(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_8:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_8:
 ; JAG-NOOPT:       # BB#0:
@@ -425,7 +425,7 @@ define i64 @test_mul_by_9(i64 %x) {
 ; X64-HSW-LABEL: test_mul_by_9:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_9:
 ; X64-JAG:       # BB#0:
@@ -443,7 +443,7 @@ define i64 @test_mul_by_9(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_9:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_9:
 ; JAG-NOOPT:       # BB#0:
@@ -477,7 +477,7 @@ define i64 @test_mul_by_10(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    addq %rdi, %rdi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_10:
 ; X64-JAG:       # BB#0:
@@ -496,7 +496,7 @@ define i64 @test_mul_by_10(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_10:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $10, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_10:
 ; JAG-NOOPT:       # BB#0:
@@ -532,7 +532,7 @@ define i64 @test_mul_by_11(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_11:
 ; X64-JAG:       # BB#0:
@@ -551,7 +551,7 @@ define i64 @test_mul_by_11(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_11:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $11, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_11:
 ; JAG-NOOPT:       # BB#0:
@@ -585,7 +585,7 @@ define i64 @test_mul_by_12(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shlq $2, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_12:
 ; X64-JAG:       # BB#0:
@@ -604,7 +604,7 @@ define i64 @test_mul_by_12(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_12:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $12, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_12:
 ; JAG-NOOPT:       # BB#0:
@@ -640,7 +640,7 @@ define i64 @test_mul_by_13(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_13:
 ; X64-JAG:       # BB#0:
@@ -659,7 +659,7 @@ define i64 @test_mul_by_13(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_13:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $13, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_13:
 ; JAG-NOOPT:       # BB#0:
@@ -696,7 +696,7 @@ define i64 @test_mul_by_14(i64 %x) {
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_14:
 ; X64-JAG:       # BB#0:
@@ -716,7 +716,7 @@ define i64 @test_mul_by_14(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_14:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $14, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_14:
 ; JAG-NOOPT:       # BB#0:
@@ -751,7 +751,7 @@ define i64 @test_mul_by_15(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_15:
 ; X64-JAG:       # BB#0:
@@ -770,7 +770,7 @@ define i64 @test_mul_by_15(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_15:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $15, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_15:
 ; JAG-NOOPT:       # BB#0:
@@ -804,7 +804,7 @@ define i64 @test_mul_by_16(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shlq $4, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_16:
 ; X64-JAG:       # BB#0:
@@ -824,7 +824,7 @@ define i64 @test_mul_by_16(i64 %x) {
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    shlq $4, %rdi # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_16:
 ; JAG-NOOPT:       # BB#0:
@@ -864,7 +864,7 @@ define i64 @test_mul_by_17(i64 %x) {
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
 ; X64-HSW-NEXT:    shlq $4, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_17:
 ; X64-JAG:       # BB#0:
@@ -884,7 +884,7 @@ define i64 @test_mul_by_17(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_17:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $17, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_17:
 ; JAG-NOOPT:       # BB#0:
@@ -920,7 +920,7 @@ define i64 @test_mul_by_18(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    addq %rdi, %rdi # sched: [1:0.25]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_18:
 ; X64-JAG:       # BB#0:
@@ -939,7 +939,7 @@ define i64 @test_mul_by_18(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_18:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $18, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_18:
 ; JAG-NOOPT:       # BB#0:
@@ -977,7 +977,7 @@ define i64 @test_mul_by_19(i64 %x) {
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    shlq $2, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_19:
 ; X64-JAG:       # BB#0:
@@ -997,7 +997,7 @@ define i64 @test_mul_by_19(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_19:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $19, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_19:
 ; JAG-NOOPT:       # BB#0:
@@ -1031,7 +1031,7 @@ define i64 @test_mul_by_20(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shlq $2, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_20:
 ; X64-JAG:       # BB#0:
@@ -1050,7 +1050,7 @@ define i64 @test_mul_by_20(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_20:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $20, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_20:
 ; JAG-NOOPT:       # BB#0:
@@ -1086,7 +1086,7 @@ define i64 @test_mul_by_21(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_21:
 ; X64-JAG:       # BB#0:
@@ -1105,7 +1105,7 @@ define i64 @test_mul_by_21(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_21:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $21, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_21:
 ; JAG-NOOPT:       # BB#0:
@@ -1142,7 +1142,7 @@ define i64 @test_mul_by_22(i64 %x) {
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_22:
 ; X64-JAG:       # BB#0:
@@ -1162,7 +1162,7 @@ define i64 @test_mul_by_22(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_22:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $22, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_22:
 ; JAG-NOOPT:       # BB#0:
@@ -1199,7 +1199,7 @@ define i64 @test_mul_by_23(i64 %x) {
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    shlq $3, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_23:
 ; X64-JAG:       # BB#0:
@@ -1219,7 +1219,7 @@ define i64 @test_mul_by_23(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_23:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $23, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_23:
 ; JAG-NOOPT:       # BB#0:
@@ -1253,7 +1253,7 @@ define i64 @test_mul_by_24(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shlq $3, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_24:
 ; X64-JAG:       # BB#0:
@@ -1272,7 +1272,7 @@ define i64 @test_mul_by_24(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_24:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $24, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_24:
 ; JAG-NOOPT:       # BB#0:
@@ -1308,7 +1308,7 @@ define i64 @test_mul_by_25(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_25:
 ; X64-JAG:       # BB#0:
@@ -1327,7 +1327,7 @@ define i64 @test_mul_by_25(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_25:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $25, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_25:
 ; JAG-NOOPT:       # BB#0:
@@ -1365,7 +1365,7 @@ define i64 @test_mul_by_26(i64 %x) {
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_26:
 ; X64-JAG:       # BB#0:
@@ -1385,7 +1385,7 @@ define i64 @test_mul_by_26(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_26:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $26, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_26:
 ; JAG-NOOPT:       # BB#0:
@@ -1420,7 +1420,7 @@ define i64 @test_mul_by_27(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_27:
 ; X64-JAG:       # BB#0:
@@ -1439,7 +1439,7 @@ define i64 @test_mul_by_27(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_27:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $27, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_27:
 ; JAG-NOOPT:       # BB#0:
@@ -1477,7 +1477,7 @@ define i64 @test_mul_by_28(i64 %x) {
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_28:
 ; X64-JAG:       # BB#0:
@@ -1497,7 +1497,7 @@ define i64 @test_mul_by_28(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_28:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $28, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_28:
 ; JAG-NOOPT:       # BB#0:
@@ -1536,7 +1536,7 @@ define i64 @test_mul_by_29(i64 %x) {
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
 ; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
 ; X64-JAG:       # BB#0:
@@ -1557,7 +1557,7 @@ define i64 @test_mul_by_29(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_29:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $29, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_29:
 ; JAG-NOOPT:       # BB#0:
@@ -1596,7 +1596,7 @@ define i64 @test_mul_by_30(i64 %x) {
 ; X64-HSW-NEXT:    shlq $5, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_30:
 ; X64-JAG:       # BB#0:
@@ -1617,7 +1617,7 @@ define i64 @test_mul_by_30(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_30:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $30, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_30:
 ; JAG-NOOPT:       # BB#0:
@@ -1654,7 +1654,7 @@ define i64 @test_mul_by_31(i64 %x) {
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
 ; X64-HSW-NEXT:    shlq $5, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_31:
 ; X64-JAG:       # BB#0:
@@ -1674,7 +1674,7 @@ define i64 @test_mul_by_31(i64 %x) {
 ; HSW-NOOPT-LABEL: test_mul_by_31:
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    imulq $31, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_31:
 ; JAG-NOOPT:       # BB#0:
@@ -1709,7 +1709,7 @@ define i64 @test_mul_by_32(i64 %x) {
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    shlq $5, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_32:
 ; X64-JAG:       # BB#0:
@@ -1729,7 +1729,7 @@ define i64 @test_mul_by_32(i64 %x) {
 ; HSW-NOOPT:       # BB#0:
 ; HSW-NOOPT-NEXT:    shlq $5, %rdi # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_32:
 ; JAG-NOOPT:       # BB#0:
@@ -1793,7 +1793,7 @@ define i64 @test_mul_spec(i64 %x) nounwi
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    addq $2, %rax # sched: [1:0.25]
 ; X64-HSW-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    retq # sched: [2:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_spec:
 ; X64-JAG:       # BB#0:
@@ -1841,7 +1841,7 @@ define i64 @test_mul_spec(i64 %x) nounwi
 ; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    addq $2, %rax # sched: [1:0.25]
 ; HSW-NOOPT-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [2:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_spec:
 ; JAG-NOOPT:       # BB#0:

Modified: llvm/trunk/test/CodeGen/X86/pr32329.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr32329.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr32329.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr32329.ll Tue Jun 27 08:05:13 2017
@@ -59,8 +59,8 @@ define void @foo() local_unnamed_addr {
 ; X86-NEXT:    cmovnel %ecx, %esi
 ; X86-NEXT:    cmpl %edx, %edi
 ; X86-NEXT:    movl %ebp, var_50+4
-; X86-NEXT:    movl %esi, var_50
 ; X86-NEXT:    setge var_205
+; X86-NEXT:    movl %esi, var_50
 ; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    movb %bl, var_218
 ; X86-NEXT:    popl %esi

Modified: llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/recip-fastmath.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/recip-fastmath.ll (original)
+++ llvm/trunk/test/CodeGen/X86/recip-fastmath.ll Tue Jun 27 08:05:13 2017
@@ -45,15 +45,15 @@ define float @f32_no_estimate(float %x)
 ;
 ; SANDY-LABEL: f32_no_estimate:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: f32_no_estimate:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; HASWELL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
+; HASWELL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_no_estimate:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -63,9 +63,9 @@ define float @f32_no_estimate(float %x)
 ;
 ; AVX512-LABEL: f32_no_estimate:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; AVX512-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
+; AVX512-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -113,18 +113,18 @@ define float @f32_one_step(float %x) #1
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
 ; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: f32_one_step:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -139,9 +139,9 @@ define float @f32_one_step(float %x) #1
 ; AVX512-LABEL: f32_one_step:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -207,7 +207,7 @@ define float @f32_two_step(float %x) #2
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
 ; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -215,18 +215,18 @@ define float @f32_two_step(float %x) #2
 ; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: f32_two_step:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_two_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -245,13 +245,13 @@ define float @f32_two_step(float %x) #2
 ; AVX512-LABEL: f32_two_step:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -284,15 +284,15 @@ define <4 x float> @v4f32_no_estimate(<4
 ;
 ; SANDY-LABEL: v4f32_no_estimate:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
-; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
+; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v4f32_no_estimate:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1 # sched: [?:5.000000e-01]
+; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -302,9 +302,9 @@ define <4 x float> @v4f32_no_estimate(<4
 ;
 ; AVX512-LABEL: v4f32_no_estimate:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50]
-; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1 # sched: [?:5.000000e-01]
+; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -350,21 +350,21 @@ define <4 x float> @v4f32_one_step(<4 x
 ;
 ; SANDY-LABEL: v4f32_one_step:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [7:3.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v4f32_one_step:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -379,17 +379,17 @@ define <4 x float> @v4f32_one_step(<4 x
 ; KNL-LABEL: v4f32_one_step:
 ; KNL:       # BB#0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v4f32_one_step:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -453,9 +453,9 @@ define <4 x float> @v4f32_two_step(<4 x
 ;
 ; SANDY-LABEL: v4f32_two_step:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [7:3.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -463,18 +463,18 @@ define <4 x float> @v4f32_two_step(<4 x
 ; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v4f32_two_step:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_two_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -493,24 +493,24 @@ define <4 x float> @v4f32_two_step(<4 x
 ; KNL-LABEL: v4f32_two_step:
 ; KNL:       # BB#0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v4f32_two_step:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -546,15 +546,15 @@ define <8 x float> @v8f32_no_estimate(<8
 ;
 ; SANDY-LABEL: v8f32_no_estimate:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
-; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
+; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [29:3.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v8f32_no_estimate:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -565,8 +565,8 @@ define <8 x float> @v8f32_no_estimate(<8
 ; AVX512-LABEL: v8f32_no_estimate:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00]
-; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -621,19 +621,19 @@ define <8 x float> @v8f32_one_step(<8 x
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v8f32_one_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -647,18 +647,18 @@ define <8 x float> @v8f32_one_step(<8 x
 ;
 ; KNL-LABEL: v8f32_one_step:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_one_step:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -737,7 +737,7 @@ define <8 x float> @v8f32_two_step(<8 x
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -745,18 +745,18 @@ define <8 x float> @v8f32_two_step(<8 x
 ; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v8f32_two_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
 ; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_two_step:
 ; HASWELL-NO-FMA:       # BB#0:
@@ -774,25 +774,25 @@ define <8 x float> @v8f32_two_step(<8 x
 ;
 ; KNL-LABEL: v8f32_two_step:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
 ; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_two_step:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
 ; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }

Modified: llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll Tue Jun 27 08:05:13 2017
@@ -39,26 +39,26 @@ define float @f32_no_step_2(float %x) #3
 ; SANDY-LABEL: f32_no_step_2:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: f32_no_step_2:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_no_step_2:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; AVX512-LABEL: f32_no_step_2:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 1234.0, %x
   ret float %div
 }
@@ -110,39 +110,39 @@ define float @f32_one_step_2(float %x) #
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
 ; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: f32_one_step_2:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; AVX512-LABEL: f32_one_step_2:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 3456.0, %x
   ret float %div
 }
@@ -198,43 +198,43 @@ define float @f32_one_step_2_divs(float
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
 ; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
 ; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: f32_one_step_2_divs:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; HASWELL-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; AVX512-LABEL: f32_one_step_2_divs:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 3456.0, %x
   %div2 = fdiv fast float %div, %x
   ret float %div2
@@ -305,7 +305,7 @@ define float @f32_two_step_2(float %x) #
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
 ; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -313,26 +313,26 @@ define float @f32_two_step_2(float %x) #
 ; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: f32_two_step_2:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: f32_two_step_2:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -340,20 +340,20 @@ define float @f32_two_step_2(float %x) #
 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; AVX512-LABEL: f32_two_step_2:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT:    retq # sched: [1:1.00]
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast float 6789.0, %x
   ret float %div
 }
@@ -403,51 +403,51 @@ define <4 x float> @v4f32_one_step2(<4 x
 ;
 ; SANDY-LABEL: v4f32_one_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [7:3.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v4f32_one_step2:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v4f32_one_step2:
 ; KNL:       # BB#0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v4f32_one_step2:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
 }
@@ -501,56 +501,56 @@ define <4 x float> @v4f32_one_step_2_div
 ;
 ; SANDY-LABEL: v4f32_one_step_2_divs:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [7:3.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v4f32_one_step_2_divs:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v4f32_one_step_2_divs:
 ; KNL:       # BB#0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; KNL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v4f32_one_step_2_divs:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
 ; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   %div2 = fdiv fast <4 x float> %div, %x
   ret <4 x float> %div2
@@ -619,9 +619,9 @@ define <4 x float> @v4f32_two_step2(<4 x
 ;
 ; SANDY-LABEL: v4f32_two_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [7:3.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -629,26 +629,26 @@ define <4 x float> @v4f32_two_step2(<4 x
 ; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v4f32_two_step2:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
 ; HASWELL-NO-FMA:       # BB#0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm3 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm3 # sched: [?:5.000000e-01]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -656,32 +656,32 @@ define <4 x float> @v4f32_two_step2(<4 x
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v4f32_two_step2:
 ; KNL:       # BB#0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v4f32_two_step2:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
 }
@@ -741,49 +741,49 @@ define <8 x float> @v8f32_one_step2(<8 x
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v8f32_one_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v8f32_one_step2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_one_step2:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
@@ -848,54 +848,54 @@ define <8 x float> @v8f32_one_step_2_div
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:2.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v8f32_one_step_2_divs:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v8f32_one_step_2_divs:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_one_step_2_divs:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
 ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   %div2 = fdiv fast <8 x float> %div, %x
   ret <8 x float> %div2
@@ -980,7 +980,7 @@ define <8 x float> @v8f32_two_step2(<8 x
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -988,59 +988,59 @@ define <8 x float> @v8f32_two_step2(<8 x
 ; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v8f32_two_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
 ; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v8f32_two_step2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
 ; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_two_step2:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
 ; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
@@ -1070,27 +1070,27 @@ define <8 x float> @v8f32_no_step(<8 x f
 ; SANDY-LABEL: v8f32_no_step:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v8f32_no_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_step:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v8f32_no_step:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_no_step:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm0
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -1125,32 +1125,32 @@ define <8 x float> @v8f32_no_step2(<8 x
 ; SANDY-LABEL: v8f32_no_step2:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v8f32_no_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq # sched: [2:1.00]
 ;
 ; KNL-LABEL: v8f32_no_step2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT:    retq # sched: [1:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: v8f32_no_step2:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcp14ps %ymm0, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT:    retq # sched: [1:1.00]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [2:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
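
The sched: [A:B] comments checked above encode each instruction's static latency (A, in cycles) and its reciprocal throughput (B); entries of the form sched: [?:5.000000e-01] appear where no fixed latency is attached to the scheduling write, so only the throughput is printed. As a rough, illustrative sketch (not taken from this patch), a write that prints as sched: [5:0.50] could be described in the Haswell model along these lines, assuming the HWPort01 resource group that X86SchedHaswell.td defines:

// Illustrative sketch only: a 5-cycle, single-uOp write that can issue on
// either port 0 or port 1 of the HWPort01 group (2 units), giving a
// reciprocal throughput of 1/2 = 0.50.
def HWWriteExampleFMA : SchedWriteRes<[HWPort01]> {
  let Latency = 5;          // static latency, printed before the ':'
  let NumMicroOps = 1;      // number of uOps the instruction consists of
  let ResourceCycles = [1]; // one cycle consumed on the HWPort01 group
}

The register name HWWriteExampleFMA is hypothetical; the real entries in the patch attach this data to the existing scheduling classes rather than introducing new defs.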

Modified: llvm/trunk/test/CodeGen/X86/sse-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-schedule.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse-schedule.ll Tue Jun 27 08:05:13 2017
@@ -31,14 +31,14 @@ define <4 x float> @test_addps(<4 x floa
 ; SANDY-LABEL: test_addps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_addps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addps:
 ; BTVER2:       # BB#0:
@@ -73,14 +73,14 @@ define float @test_addss(float %a0, floa
 ; SANDY-LABEL: test_addss:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_addss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addss:
 ; BTVER2:       # BB#0:
@@ -122,15 +122,15 @@ define <4 x float> @test_andps(<4 x floa
 ;
 ; SANDY-LABEL: test_andps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_andps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andps:
 ; BTVER2:       # BB#0:
@@ -176,15 +176,15 @@ define <4 x float> @test_andnotps(<4 x f
 ;
 ; SANDY-LABEL: test_andnotps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_andnotps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andnotps:
 ; BTVER2:       # BB#0:
@@ -228,16 +228,16 @@ define <4 x float> @test_cmpps(<4 x floa
 ; SANDY-LABEL: test_cmpps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cmpps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; HASWELL-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmpps:
 ; BTVER2:       # BB#0:
@@ -277,13 +277,13 @@ define float @test_cmpss(float %a0, floa
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cmpss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmpss:
 ; BTVER2:       # BB#0:
@@ -347,30 +347,30 @@ define i32 @test_comiss(<4 x float> %a0,
 ; SANDY-LABEL: test_comiss:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    setnp %al # sched: [1:0.33]
-; SANDY-NEXT:    sete %cl # sched: [1:0.33]
+; SANDY-NEXT:    setnp %al # sched: [1:1.00]
+; SANDY-NEXT:    sete %cl # sched: [1:1.00]
 ; SANDY-NEXT:    andb %al, %cl # sched: [1:0.33]
 ; SANDY-NEXT:    vcomiss (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    setnp %al # sched: [1:0.33]
-; SANDY-NEXT:    sete %dl # sched: [1:0.33]
+; SANDY-NEXT:    setnp %al # sched: [1:1.00]
+; SANDY-NEXT:    sete %dl # sched: [1:1.00]
 ; SANDY-NEXT:    andb %al, %dl # sched: [1:0.33]
 ; SANDY-NEXT:    orb %cl, %dl # sched: [1:0.33]
 ; SANDY-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_comiss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcomiss %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    setnp %al # sched: [1:0.50]
-; HASWELL-NEXT:    sete %cl # sched: [1:0.50]
+; HASWELL-NEXT:    setnp %al # sched: [1:1.00]
+; HASWELL-NEXT:    sete %cl # sched: [1:1.00]
 ; HASWELL-NEXT:    andb %al, %cl # sched: [1:0.25]
 ; HASWELL-NEXT:    vcomiss (%rdi), %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    setnp %al # sched: [1:0.50]
-; HASWELL-NEXT:    sete %dl # sched: [1:0.50]
+; HASWELL-NEXT:    setnp %al # sched: [1:1.00]
+; HASWELL-NEXT:    sete %dl # sched: [1:1.00]
 ; HASWELL-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_comiss:
 ; BTVER2:       # BB#0:
@@ -417,17 +417,17 @@ define float @test_cvtsi2ss(i32 %a0, i32
 ;
 ; SANDY-LABEL: test_cvtsi2ss:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT:    vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT:    vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
 ; SANDY-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtsi2ss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
 ; HASWELL-NEXT:    vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
 ; HASWELL-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2ss:
 ; BTVER2:       # BB#0:
@@ -466,17 +466,17 @@ define float @test_cvtsi2ssq(i64 %a0, i6
 ;
 ; SANDY-LABEL: test_cvtsi2ssq:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT:    vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT:    vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
 ; SANDY-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtsi2ssq:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
 ; HASWELL-NEXT:    vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
 ; HASWELL-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2ssq:
 ; BTVER2:       # BB#0:
@@ -515,17 +515,17 @@ define i32 @test_cvtss2si(float %a0, flo
 ;
 ; SANDY-LABEL: test_cvtss2si:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtss2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT:    vcvtss2si (%rdi), %eax # sched: [7:1.00]
+; SANDY-NEXT:    vcvtss2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT:    vcvtss2si (%rdi), %eax # sched: [10:1.00]
 ; SANDY-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtss2si:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtss2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtss2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtss2si (%rdi), %eax # sched: [4:1.00]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtss2si:
 ; BTVER2:       # BB#0:
@@ -567,17 +567,17 @@ define i64 @test_cvtss2siq(float %a0, fl
 ;
 ; SANDY-LABEL: test_cvtss2siq:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtss2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT:    vcvtss2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT:    vcvtss2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT:    vcvtss2si (%rdi), %rax # sched: [10:1.00]
 ; SANDY-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtss2siq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtss2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtss2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtss2si (%rdi), %rax # sched: [4:1.00]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtss2siq:
 ; BTVER2:       # BB#0:
@@ -619,17 +619,17 @@ define i32 @test_cvttss2si(float %a0, fl
 ;
 ; SANDY-LABEL: test_cvttss2si:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvttss2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT:    vcvttss2si (%rdi), %eax # sched: [7:1.00]
+; SANDY-NEXT:    vcvttss2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT:    vcvttss2si (%rdi), %eax # sched: [10:1.00]
 ; SANDY-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvttss2si:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttss2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvttss2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvttss2si (%rdi), %eax # sched: [4:1.00]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttss2si:
 ; BTVER2:       # BB#0:
@@ -668,17 +668,17 @@ define i64 @test_cvttss2siq(float %a0, f
 ;
 ; SANDY-LABEL: test_cvttss2siq:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvttss2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT:    vcvttss2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT:    vcvttss2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT:    vcvttss2si (%rdi), %rax # sched: [10:1.00]
 ; SANDY-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvttss2siq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttss2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvttss2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvttss2si (%rdi), %rax # sched: [4:1.00]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttss2siq:
 ; BTVER2:       # BB#0:
@@ -714,15 +714,15 @@ define <4 x float> @test_divps(<4 x floa
 ;
 ; SANDY-LABEL: test_divps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_divps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divps:
 ; BTVER2:       # BB#0:
@@ -756,15 +756,15 @@ define float @test_divss(float %a0, floa
 ;
 ; SANDY-LABEL: test_divss:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_divss:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divss:
 ; BTVER2:       # BB#0:
@@ -799,14 +799,14 @@ define void @test_ldmxcsr(i32 %a0) {
 ; SANDY-LABEL: test_ldmxcsr:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SANDY-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [4:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_ldmxcsr:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; HASWELL-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [6:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [2:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ldmxcsr:
 ; BTVER2:       # BB#0:
@@ -843,14 +843,14 @@ define <4 x float> @test_maxps(<4 x floa
 ; SANDY-LABEL: test_maxps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_maxps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxps:
 ; BTVER2:       # BB#0:
@@ -886,14 +886,14 @@ define <4 x float> @test_maxss(<4 x floa
 ; SANDY-LABEL: test_maxss:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_maxss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxss:
 ; BTVER2:       # BB#0:
@@ -929,14 +929,14 @@ define <4 x float> @test_minps(<4 x floa
 ; SANDY-LABEL: test_minps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_minps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minps:
 ; BTVER2:       # BB#0:
@@ -972,14 +972,14 @@ define <4 x float> @test_minss(<4 x floa
 ; SANDY-LABEL: test_minss:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_minss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minss:
 ; BTVER2:       # BB#0:
@@ -1017,17 +1017,17 @@ define void @test_movaps(<4 x float> *%a
 ;
 ; SANDY-LABEL: test_movaps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovaps (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT:    vmovaps (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovaps %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movaps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovaps (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovaps (%rdi), %xmm0 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovaps %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movaps:
 ; BTVER2:       # BB#0:
@@ -1068,12 +1068,12 @@ define <4 x float> @test_movhlps(<4 x fl
 ; SANDY-LABEL: test_movhlps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movhlps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movhlps:
 ; BTVER2:       # BB#0:
@@ -1111,17 +1111,17 @@ define void @test_movhps(<4 x float> %a0
 ;
 ; SANDY-LABEL: test_movhps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movhps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movhps:
 ; BTVER2:       # BB#0:
@@ -1164,13 +1164,13 @@ define <4 x float> @test_movlhps(<4 x fl
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movlhps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movlhps:
 ; BTVER2:       # BB#0:
@@ -1206,17 +1206,17 @@ define void @test_movlps(<4 x float> %a0
 ;
 ; SANDY-LABEL: test_movlps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; SANDY-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovlps %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movlps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; HASWELL-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovlps %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movlps:
 ; BTVER2:       # BB#0:
@@ -1254,13 +1254,13 @@ define i32 @test_movmskps(<4 x float> %a
 ;
 ; SANDY-LABEL: test_movmskps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovmskps %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovmskps %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movmskps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovmskps %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movmskps:
 ; BTVER2:       # BB#0:
@@ -1295,13 +1295,13 @@ define void @test_movntps(<4 x float> %a
 ;
 ; SANDY-LABEL: test_movntps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovntps %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movntps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovntps %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movntps:
 ; BTVER2:       # BB#0:
@@ -1335,17 +1335,17 @@ define void @test_movss_mem(float* %a0,
 ;
 ; SANDY-LABEL: test_movss_mem:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
 ; SANDY-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovss %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movss_mem:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovss %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movss_mem:
 ; BTVER2:       # BB#0:
@@ -1383,13 +1383,13 @@ define <4 x float> @test_movss_reg(<4 x
 ;
 ; SANDY-LABEL: test_movss_reg:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movss_reg:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movss_reg:
 ; BTVER2:       # BB#0:
@@ -1423,17 +1423,17 @@ define void @test_movups(<4 x float> *%a
 ;
 ; SANDY-LABEL: test_movups:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movups:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovups (%rdi), %xmm0 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovups %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movups:
 ; BTVER2:       # BB#0:
@@ -1469,14 +1469,14 @@ define <4 x float> @test_mulps(<4 x floa
 ; SANDY-LABEL: test_mulps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_mulps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulps:
 ; BTVER2:       # BB#0:
@@ -1511,14 +1511,14 @@ define float @test_mulss(float %a0, floa
 ; SANDY-LABEL: test_mulss:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_mulss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulss:
 ; BTVER2:       # BB#0:
@@ -1560,15 +1560,15 @@ define <4 x float> @test_orps(<4 x float
 ;
 ; SANDY-LABEL: test_orps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_orps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_orps:
 ; BTVER2:       # BB#0:
@@ -1609,13 +1609,13 @@ define void @test_prefetchnta(i8* %a0) {
 ;
 ; SANDY-LABEL: test_prefetchnta:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    prefetchnta (%rdi) # sched: [4:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    prefetchnta (%rdi) # sched: [5:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_prefetchnta:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    prefetchnta (%rdi) # sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    prefetchnta (%rdi) # sched: [?:5.000000e-01]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_prefetchnta:
 ; BTVER2:       # BB#0:
@@ -1652,17 +1652,17 @@ define <4 x float> @test_rcpps(<4 x floa
 ;
 ; SANDY-LABEL: test_rcpps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vrcpps (%rdi), %xmm1 # sched: [9:1.00]
+; SANDY-NEXT:    vrcpps %xmm0, %xmm0 # sched: [7:3.00]
+; SANDY-NEXT:    vrcpps (%rdi), %xmm1 # sched: [11:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_rcpps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vrcpps (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT:    vrcpps (%rdi), %xmm1 # sched: [5:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rcpps:
 ; BTVER2:       # BB#0:
@@ -1708,18 +1708,18 @@ define <4 x float> @test_rcpss(float %a0
 ; SANDY-LABEL: test_rcpss:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
 ; SANDY-NEXT:    vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_rcpss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rcpss:
 ; BTVER2:       # BB#0:
@@ -1765,16 +1765,16 @@ define <4 x float> @test_rsqrtps(<4 x fl
 ; SANDY-LABEL: test_rsqrtps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vrsqrtps (%rdi), %xmm1 # sched: [9:1.00]
+; SANDY-NEXT:    vrsqrtps (%rdi), %xmm1 # sched: [11:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_rsqrtps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vrsqrtps (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT:    vrsqrtps (%rdi), %xmm1 # sched: [5:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rsqrtps:
 ; BTVER2:       # BB#0:
@@ -1819,19 +1819,19 @@ define <4 x float> @test_rsqrtss(float %
 ;
 ; SANDY-LABEL: test_rsqrtss:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_rsqrtss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_rsqrtss:
 ; BTVER2:       # BB#0:
@@ -1875,12 +1875,12 @@ define void @test_sfence() {
 ; SANDY-LABEL: test_sfence:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    sfence # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_sfence:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    sfence # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    sfence # sched: [1:0.33]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sfence:
 ; BTVER2:       # BB#0:
@@ -1917,14 +1917,14 @@ define <4 x float> @test_shufps(<4 x flo
 ; SANDY-LABEL: test_shufps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; SANDY-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [7:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_shufps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; HASWELL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_shufps:
 ; BTVER2:       # BB#0:
@@ -1962,17 +1962,17 @@ define <4 x float> @test_sqrtps(<4 x flo
 ;
 ; SANDY-LABEL: test_sqrtps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vsqrtps %xmm0, %xmm0 # sched: [15:1.00]
-; SANDY-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [19:1.00]
+; SANDY-NEXT:    vsqrtps %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [20:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_sqrtps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtps %xmm0, %xmm0 # sched: [15:1.00]
-; HASWELL-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT:    vsqrtps %xmm0, %xmm0 # sched: [14:1.00]
+; HASWELL-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [14:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtps:
 ; BTVER2:       # BB#0:
@@ -2017,19 +2017,19 @@ define <4 x float> @test_sqrtss(<4 x flo
 ;
 ; SANDY-LABEL: test_sqrtss:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; SANDY-NEXT:    vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; SANDY-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00]
+; SANDY-NEXT:    vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1 # sched: [114:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_sqrtss:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; HASWELL-NEXT:    vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [14:1.00]
+; HASWELL-NEXT:    vmovaps (%rdi), %xmm1 # sched: [?:5.000000e-01]
+; HASWELL-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1 # sched: [14:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtss:
 ; BTVER2:       # BB#0:
@@ -2067,15 +2067,15 @@ define i32 @test_stmxcsr() {
 ;
 ; SANDY-LABEL: test_stmxcsr:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SANDY-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_stmxcsr:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
-; HASWELL-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; HASWELL-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [?:5.000000e-01]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_stmxcsr:
 ; BTVER2:       # BB#0:
@@ -2112,14 +2112,14 @@ define <4 x float> @test_subps(<4 x floa
 ; SANDY-LABEL: test_subps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_subps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subps:
 ; BTVER2:       # BB#0:
@@ -2154,14 +2154,14 @@ define float @test_subss(float %a0, floa
 ; SANDY-LABEL: test_subss:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_subss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subss:
 ; BTVER2:       # BB#0:
@@ -2220,30 +2220,30 @@ define i32 @test_ucomiss(<4 x float> %a0
 ; SANDY-LABEL: test_ucomiss:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vucomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    setnp %al # sched: [1:0.33]
-; SANDY-NEXT:    sete %cl # sched: [1:0.33]
+; SANDY-NEXT:    setnp %al # sched: [1:1.00]
+; SANDY-NEXT:    sete %cl # sched: [1:1.00]
 ; SANDY-NEXT:    andb %al, %cl # sched: [1:0.33]
 ; SANDY-NEXT:    vucomiss (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    setnp %al # sched: [1:0.33]
-; SANDY-NEXT:    sete %dl # sched: [1:0.33]
+; SANDY-NEXT:    setnp %al # sched: [1:1.00]
+; SANDY-NEXT:    sete %dl # sched: [1:1.00]
 ; SANDY-NEXT:    andb %al, %dl # sched: [1:0.33]
 ; SANDY-NEXT:    orb %cl, %dl # sched: [1:0.33]
 ; SANDY-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_ucomiss:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vucomiss %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    setnp %al # sched: [1:0.50]
-; HASWELL-NEXT:    sete %cl # sched: [1:0.50]
+; HASWELL-NEXT:    setnp %al # sched: [1:1.00]
+; HASWELL-NEXT:    sete %cl # sched: [1:1.00]
 ; HASWELL-NEXT:    andb %al, %cl # sched: [1:0.25]
 ; HASWELL-NEXT:    vucomiss (%rdi), %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    setnp %al # sched: [1:0.50]
-; HASWELL-NEXT:    sete %dl # sched: [1:0.50]
+; HASWELL-NEXT:    setnp %al # sched: [1:1.00]
+; HASWELL-NEXT:    sete %dl # sched: [1:1.00]
 ; HASWELL-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ucomiss:
 ; BTVER2:       # BB#0:
@@ -2292,14 +2292,14 @@ define <4 x float> @test_unpckhps(<4 x f
 ; SANDY-LABEL: test_unpckhps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SANDY-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_unpckhps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhps:
 ; BTVER2:       # BB#0:
@@ -2338,14 +2338,14 @@ define <4 x float> @test_unpcklps(<4 x f
 ; SANDY-LABEL: test_unpcklps:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SANDY-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_unpcklps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklps:
 ; BTVER2:       # BB#0:
@@ -2387,15 +2387,15 @@ define <4 x float> @test_xorps(<4 x floa
 ;
 ; SANDY-LABEL: test_xorps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_xorps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_xorps:
 ; BTVER2:       # BB#0:

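For reference, the "sched:" comments checked throughout these tests use the
[latency:reciprocal-throughput] notation that llc prints next to each machine
instruction when schedule-comment printing is enabled. A minimal test written
in the same style might look like the sketch below; the RUN line flags and the
function name are illustrative assumptions for this sketch and are not taken
from this patch.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=HASWELL
; Sketch only: checks a register add and the return against the Haswell model.

define <4 x float> @sched_example(<4 x float> %a0, <4 x float> %a1) {
; HASWELL-LABEL: sched_example:
; HASWELL:       # BB#0:
; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT:    retq # sched: [2:1.00]
  %1 = fadd <4 x float> %a0, %a1
  ret <4 x float> %1
}

Read [3:1.00] as a static latency of 3 cycles with a reciprocal throughput of
1.00. Entries printed with a "?", such as [?:5.000000e-01], appear to be writes
for which no integer latency is reported at this revision, so only the raw
throughput value is shown.
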
Modified: llvm/trunk/test/CodeGen/X86/sse2-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2-schedule.ll?rev=306414&r1=306413&r2=306414&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse2-schedule.ll Tue Jun 27 08:05:13 2017
@@ -31,14 +31,14 @@ define <2 x double> @test_addpd(<2 x dou
 ; SANDY-LABEL: test_addpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_addpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addpd:
 ; BTVER2:       # BB#0:
@@ -73,14 +73,14 @@ define double @test_addsd(double %a0, do
 ; SANDY-LABEL: test_addsd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_addsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_addsd:
 ; BTVER2:       # BB#0:
@@ -117,17 +117,17 @@ define <2 x double> @test_andpd(<2 x dou
 ;
 ; SANDY-LABEL: test_andpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_andpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andpd:
 ; BTVER2:       # BB#0:
@@ -170,17 +170,17 @@ define <2 x double> @test_andnotpd(<2 x
 ;
 ; SANDY-LABEL: test_andnotpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_andnotpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_andnotpd:
 ; BTVER2:       # BB#0:
@@ -226,16 +226,16 @@ define <2 x double> @test_cmppd(<2 x dou
 ; SANDY-LABEL: test_cmppd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cmppd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; HASWELL-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmppd:
 ; BTVER2:       # BB#0:
@@ -275,13 +275,13 @@ define double @test_cmpsd(double %a0, do
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cmpsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cmpsd:
 ; BTVER2:       # BB#0:
@@ -345,30 +345,30 @@ define i32 @test_comisd(<2 x double> %a0
 ; SANDY-LABEL: test_comisd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    setnp %al # sched: [1:0.33]
-; SANDY-NEXT:    sete %cl # sched: [1:0.33]
+; SANDY-NEXT:    setnp %al # sched: [1:1.00]
+; SANDY-NEXT:    sete %cl # sched: [1:1.00]
 ; SANDY-NEXT:    andb %al, %cl # sched: [1:0.33]
 ; SANDY-NEXT:    vcomisd (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    setnp %al # sched: [1:0.33]
-; SANDY-NEXT:    sete %dl # sched: [1:0.33]
+; SANDY-NEXT:    setnp %al # sched: [1:1.00]
+; SANDY-NEXT:    sete %dl # sched: [1:1.00]
 ; SANDY-NEXT:    andb %al, %dl # sched: [1:0.33]
 ; SANDY-NEXT:    orb %cl, %dl # sched: [1:0.33]
 ; SANDY-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_comisd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcomisd %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    setnp %al # sched: [1:0.50]
-; HASWELL-NEXT:    sete %cl # sched: [1:0.50]
+; HASWELL-NEXT:    setnp %al # sched: [1:1.00]
+; HASWELL-NEXT:    sete %cl # sched: [1:1.00]
 ; HASWELL-NEXT:    andb %al, %cl # sched: [1:0.25]
 ; HASWELL-NEXT:    vcomisd (%rdi), %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    setnp %al # sched: [1:0.50]
-; HASWELL-NEXT:    sete %dl # sched: [1:0.50]
+; HASWELL-NEXT:    setnp %al # sched: [1:1.00]
+; HASWELL-NEXT:    sete %dl # sched: [1:1.00]
 ; HASWELL-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_comisd:
 ; BTVER2:       # BB#0:
@@ -416,16 +416,16 @@ define <2 x double> @test_cvtdq2pd(<4 x
 ; SANDY-LABEL: test_cvtdq2pd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00]
+; SANDY-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [10:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtdq2pd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [4:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2pd:
 ; BTVER2:       # BB#0:
@@ -467,17 +467,17 @@ define <4 x float> @test_cvtdq2ps(<4 x i
 ;
 ; SANDY-LABEL: test_cvtdq2ps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT:    vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
+; SANDY-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT:    vcvtdq2ps (%rdi), %xmm1 # sched: [9:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtdq2ps:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vcvtdq2ps (%rdi), %xmm1 # sched: [3:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2ps:
 ; BTVER2:       # BB#0:
@@ -517,17 +517,17 @@ define <4 x i32> @test_cvtpd2dq(<2 x dou
 ;
 ; SANDY-LABEL: test_cvtpd2dq:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT:    vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT:    vcvtpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
 ; SANDY-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtpd2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2dq:
 ; BTVER2:       # BB#0:
@@ -568,17 +568,17 @@ define <4 x float> @test_cvtpd2ps(<2 x d
 ;
 ; SANDY-LABEL: test_cvtpd2ps:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT:    vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT:    vcvtpd2psx (%rdi), %xmm1 # sched: [10:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtpd2ps:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2ps:
 ; BTVER2:       # BB#0:
@@ -620,16 +620,16 @@ define <4 x i32> @test_cvtps2dq(<4 x flo
 ; SANDY-LABEL: test_cvtps2dq:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT:    vcvtps2dq (%rdi), %xmm1 # sched: [9:1.00]
 ; SANDY-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtps2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; HASWELL-NEXT:    vcvtps2dq (%rdi), %xmm1 # sched: [3:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtps2dq:
 ; BTVER2:       # BB#0:
@@ -670,17 +670,17 @@ define <2 x double> @test_cvtps2pd(<4 x
 ;
 ; SANDY-LABEL: test_cvtps2pd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT:    vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
 ; SANDY-NEXT:    vcvtps2pd (%rdi), %xmm1 # sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtps2pd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vcvtps2pd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT:    vcvtps2pd (%rdi), %xmm1 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtps2pd:
 ; BTVER2:       # BB#0:
@@ -724,14 +724,14 @@ define i32 @test_cvtsd2si(double %a0, do
 ; SANDY-NEXT:    vcvtsd2si %xmm0, %ecx # sched: [3:1.00]
 ; SANDY-NEXT:    vcvtsd2si (%rdi), %eax # sched: [7:1.00]
 ; SANDY-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtsd2si:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtsd2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtsd2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtsd2si %xmm0, %ecx # sched: [3:1.00]
+; HASWELL-NEXT:    vcvtsd2si (%rdi), %eax # sched: [7:1.00]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsd2si:
 ; BTVER2:       # BB#0:
@@ -773,17 +773,17 @@ define i64 @test_cvtsd2siq(double %a0, d
 ;
 ; SANDY-LABEL: test_cvtsd2siq:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtsd2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT:    vcvtsd2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT:    vcvtsd2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT:    vcvtsd2si (%rdi), %rax # sched: [10:1.00]
 ; SANDY-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtsd2siq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtsd2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvtsd2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvtsd2si (%rdi), %rax # sched: [4:1.00]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsd2siq:
 ; BTVER2:       # BB#0:
@@ -830,18 +830,18 @@ define float @test_cvtsd2ss(double %a0,
 ; SANDY-LABEL: test_cvtsd2ss:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
 ; SANDY-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
 ; SANDY-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtsd2ss:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
-; HASWELL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
+; HASWELL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [?:5.000000e-01]
+; HASWELL-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
 ; HASWELL-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsd2ss:
 ; BTVER2:       # BB#0:
@@ -882,16 +882,16 @@ define double @test_cvtsi2sd(i32 %a0, i3
 ; SANDY-LABEL: test_cvtsi2sd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT:    vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT:    vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
 ; SANDY-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtsi2sd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
 ; HASWELL-NEXT:    vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
 ; HASWELL-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2sd:
 ; BTVER2:       # BB#0:
@@ -931,16 +931,16 @@ define double @test_cvtsi2sdq(i64 %a0, i
 ; SANDY-LABEL: test_cvtsi2sdq:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT:    vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT:    vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
 ; SANDY-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtsi2sdq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
 ; HASWELL-NEXT:    vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
 ; HASWELL-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2sdq:
 ; BTVER2:       # BB#0:
@@ -985,19 +985,19 @@ define double @test_cvtss2sd(float %a0,
 ;
 ; SANDY-LABEL: test_cvtss2sd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
+; SANDY-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
 ; SANDY-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvtss2sd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [2:1.00]
 ; HASWELL-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvtss2sd:
 ; BTVER2:       # BB#0:
@@ -1038,17 +1038,17 @@ define <4 x i32> @test_cvttpd2dq(<2 x do
 ;
 ; SANDY-LABEL: test_cvttpd2dq:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT:    vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT:    vcvttpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
 ; SANDY-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvttpd2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT:    vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT:    vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttpd2dq:
 ; BTVER2:       # BB#0:
@@ -1091,16 +1091,16 @@ define <4 x i32> @test_cvttps2dq(<4 x fl
 ; SANDY-LABEL: test_cvttps2dq:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT:    vcvttps2dq (%rdi), %xmm1 # sched: [9:1.00]
 ; SANDY-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvttps2dq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; HASWELL-NEXT:    vcvttps2dq (%rdi), %xmm1 # sched: [3:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttps2dq:
 ; BTVER2:       # BB#0:
@@ -1139,17 +1139,17 @@ define i32 @test_cvttsd2si(double %a0, d
 ;
 ; SANDY-LABEL: test_cvttsd2si:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvttsd2si %xmm0, %ecx # sched: [3:1.00]
+; SANDY-NEXT:    vcvttsd2si %xmm0, %ecx # sched: [5:1.00]
 ; SANDY-NEXT:    vcvttsd2si (%rdi), %eax # sched: [7:1.00]
 ; SANDY-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvttsd2si:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttsd2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvttsd2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvttsd2si (%rdi), %eax # sched: [7:1.00]
 ; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttsd2si:
 ; BTVER2:       # BB#0:
@@ -1188,17 +1188,17 @@ define i64 @test_cvttsd2siq(double %a0,
 ;
 ; SANDY-LABEL: test_cvttsd2siq:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vcvttsd2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT:    vcvttsd2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT:    vcvttsd2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT:    vcvttsd2si (%rdi), %rax # sched: [10:1.00]
 ; SANDY-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_cvttsd2siq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vcvttsd2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT:    vcvttsd2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT:    vcvttsd2si (%rdi), %rax # sched: [4:1.00]
 ; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_cvttsd2siq:
 ; BTVER2:       # BB#0:
@@ -1234,15 +1234,15 @@ define <2 x double> @test_divpd(<2 x dou
 ;
 ; SANDY-LABEL: test_divpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [28:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_divpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 # sched: [19:4.00]
+; HASWELL-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [19:4.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divpd:
 ; BTVER2:       # BB#0:
@@ -1276,15 +1276,15 @@ define double @test_divsd(double %a0, do
 ;
 ; SANDY-LABEL: test_divsd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [28:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_divsd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 # sched: [19:4.00]
+; HASWELL-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [19:4.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_divsd:
 ; BTVER2:       # BB#0:
@@ -1322,12 +1322,12 @@ define void @test_lfence() {
 ; SANDY-LABEL: test_lfence:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    lfence # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_lfence:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    lfence # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    lfence # sched: [2:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_lfence:
 ; BTVER2:       # BB#0:
@@ -1363,12 +1363,12 @@ define void @test_mfence() {
 ; SANDY-LABEL: test_mfence:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    mfence # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_mfence:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    mfence # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    mfence # sched: [2:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mfence:
 ; BTVER2:       # BB#0:
@@ -1402,12 +1402,12 @@ define void @test_maskmovdqu(<16 x i8> %
 ; SANDY-LABEL: test_maskmovdqu:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_maskmovdqu:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [14:2.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovdqu:
 ; BTVER2:       # BB#0:
@@ -1440,14 +1440,14 @@ define <2 x double> @test_maxpd(<2 x dou
 ; SANDY-LABEL: test_maxpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_maxpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxpd:
 ; BTVER2:       # BB#0:
@@ -1483,14 +1483,14 @@ define <2 x double> @test_maxsd(<2 x dou
 ; SANDY-LABEL: test_maxsd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_maxsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_maxsd:
 ; BTVER2:       # BB#0:
@@ -1526,14 +1526,14 @@ define <2 x double> @test_minpd(<2 x dou
 ; SANDY-LABEL: test_minpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_minpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minpd:
 ; BTVER2:       # BB#0:
@@ -1569,14 +1569,14 @@ define <2 x double> @test_minsd(<2 x dou
 ; SANDY-LABEL: test_minsd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_minsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_minsd:
 ; BTVER2:       # BB#0:
@@ -1614,17 +1614,17 @@ define void @test_movapd(<2 x double> *%
 ;
 ; SANDY-LABEL: test_movapd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovapd (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT:    vmovapd (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovapd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movapd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovapd (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovapd (%rdi), %xmm0 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovapd %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movapd:
 ; BTVER2:       # BB#0:
@@ -1662,17 +1662,17 @@ define void @test_movdqa(<2 x i64> *%a0,
 ;
 ; SANDY-LABEL: test_movdqa:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movdqa:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movdqa:
 ; BTVER2:       # BB#0:
@@ -1710,17 +1710,17 @@ define void @test_movdqu(<2 x i64> *%a0,
 ;
 ; SANDY-LABEL: test_movdqu:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movdqu:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movdqu:
 ; BTVER2:       # BB#0:
@@ -1768,22 +1768,22 @@ define i32 @test_movd(<4 x i32> %a0, i32
 ; SANDY-LABEL: test_movd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmovd %edi, %xmm1 # sched: [1:0.33]
-; SANDY-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
 ; SANDY-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
 ; SANDY-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vmovd %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT:    vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovd %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT:    vmovd %xmm1, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovd %edi, %xmm1 # sched: [1:1.00]
-; HASWELL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovd %edi, %xmm1 # sched: [1:0.25]
+; HASWELL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vmovd %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT:    vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovd %xmm1, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movd:
 ; BTVER2:       # BB#0:
@@ -1838,23 +1838,23 @@ define i64 @test_movd_64(<2 x i64> %a0,
 ;
 ; SANDY-LABEL: test_movd_64:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovq %rdi, %xmm1 # sched: [1:0.33]
-; SANDY-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT:    vmovq %rdi, %xmm1 # sched: [1:1.00]
+; SANDY-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero sched: [6:0.50]
 ; SANDY-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
 ; SANDY-NEXT:    vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vmovq %xmm0, %rax # sched: [1:0.33]
-; SANDY-NEXT:    vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovq %xmm0, %rax # sched: [2:1.00]
+; SANDY-NEXT:    vmovq %xmm1, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movd_64:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovq %rdi, %xmm1 # sched: [1:1.00]
-; HASWELL-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vmovq %xmm0, %rax # sched: [1:1.00]
-; HASWELL-NEXT:    vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovq %xmm1, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movd_64:
 ; BTVER2:       # BB#0:
@@ -1900,17 +1900,17 @@ define void @test_movhpd(<2 x double> %a
 ;
 ; SANDY-LABEL: test_movhpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movhpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movhpd:
 ; BTVER2:       # BB#0:
@@ -1951,17 +1951,17 @@ define void @test_movlpd(<2 x double> %a
 ;
 ; SANDY-LABEL: test_movlpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; SANDY-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movlpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; HASWELL-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movlpd:
 ; BTVER2:       # BB#0:
@@ -1998,13 +1998,13 @@ define i32 @test_movmskpd(<2 x double> %
 ;
 ; SANDY-LABEL: test_movmskpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovmskpd %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovmskpd %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movmskpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovmskpd %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movmskpd:
 ; BTVER2:       # BB#0:
@@ -2039,14 +2039,14 @@ define void @test_movntdqa(<2 x i64> %a0
 ; SANDY-LABEL: test_movntdqa:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movntdqa:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movntdqa:
 ; BTVER2:       # BB#0:
@@ -2080,14 +2080,14 @@ define void @test_movntpd(<2 x double> %
 ; SANDY-LABEL: test_movntpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movntpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movntpd:
 ; BTVER2:       # BB#0:
@@ -2123,17 +2123,17 @@ define <2 x i64> @test_movq_mem(<2 x i64
 ;
 ; SANDY-LABEL: test_movq_mem:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
 ; SANDY-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovq %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movq_mem:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovq %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movq_mem:
 ; BTVER2:       # BB#0:
@@ -2174,13 +2174,13 @@ define <2 x i64> @test_movq_reg(<2 x i64
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
 ; SANDY-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movq_reg:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
 ; HASWELL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movq_reg:
 ; BTVER2:       # BB#0:
@@ -2216,17 +2216,17 @@ define void @test_movsd_mem(double* %a0,
 ;
 ; SANDY-LABEL: test_movsd_mem:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
 ; SANDY-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovsd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movsd_mem:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovsd %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movsd_mem:
 ; BTVER2:       # BB#0:
@@ -2266,12 +2266,12 @@ define <2 x double> @test_movsd_reg(<2 x
 ; SANDY-LABEL: test_movsd_reg:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movsd_reg:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movsd_reg:
 ; BTVER2:       # BB#0:
@@ -2305,17 +2305,17 @@ define void @test_movupd(<2 x double> *%
 ;
 ; SANDY-LABEL: test_movupd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vmovupd (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT:    vmovupd (%rdi), %xmm0 # sched: [6:0.50]
 ; SANDY-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movupd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vmovupd (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT:    vmovupd (%rdi), %xmm0 # sched: [?:5.000000e-01]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmovupd %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_movupd:
 ; BTVER2:       # BB#0:
@@ -2351,14 +2351,14 @@ define <2 x double> @test_mulpd(<2 x dou
 ; SANDY-LABEL: test_mulpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_mulpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulpd:
 ; BTVER2:       # BB#0:
@@ -2393,14 +2393,14 @@ define double @test_mulsd(double %a0, do
 ; SANDY-LABEL: test_mulsd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_mulsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_mulsd:
 ; BTVER2:       # BB#0:
@@ -2437,17 +2437,17 @@ define <2 x double> @test_orpd(<2 x doub
 ;
 ; SANDY-LABEL: test_orpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_orpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_orpd:
 ; BTVER2:       # BB#0:
@@ -2496,14 +2496,14 @@ define <8 x i16> @test_packssdw(<4 x i32
 ; SANDY-LABEL: test_packssdw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_packssdw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_packssdw:
 ; BTVER2:       # BB#0:
@@ -2548,14 +2548,14 @@ define <16 x i8> @test_packsswb(<8 x i16
 ; SANDY-LABEL: test_packsswb:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_packsswb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_packsswb:
 ; BTVER2:       # BB#0:
@@ -2600,14 +2600,14 @@ define <16 x i8> @test_packuswb(<8 x i16
 ; SANDY-LABEL: test_packuswb:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_packuswb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_packuswb:
 ; BTVER2:       # BB#0:
@@ -2648,14 +2648,14 @@ define <16 x i8> @test_paddb(<16 x i8> %
 ; SANDY-LABEL: test_paddb:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_paddb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddb:
 ; BTVER2:       # BB#0:
@@ -2694,14 +2694,14 @@ define <4 x i32> @test_paddd(<4 x i32> %
 ; SANDY-LABEL: test_paddd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_paddd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddd:
 ; BTVER2:       # BB#0:
@@ -2736,14 +2736,14 @@ define <2 x i64> @test_paddq(<2 x i64> %
 ; SANDY-LABEL: test_paddq:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_paddq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddq:
 ; BTVER2:       # BB#0:
@@ -2782,14 +2782,14 @@ define <16 x i8> @test_paddsb(<16 x i8>
 ; SANDY-LABEL: test_paddsb:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_paddsb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddsb:
 ; BTVER2:       # BB#0:
@@ -2829,14 +2829,14 @@ define <8 x i16> @test_paddsw(<8 x i16>
 ; SANDY-LABEL: test_paddsw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_paddsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddsw:
 ; BTVER2:       # BB#0:
@@ -2876,14 +2876,14 @@ define <16 x i8> @test_paddusb(<16 x i8>
 ; SANDY-LABEL: test_paddusb:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_paddusb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddusb:
 ; BTVER2:       # BB#0:
@@ -2923,14 +2923,14 @@ define <8 x i16> @test_paddusw(<8 x i16>
 ; SANDY-LABEL: test_paddusw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_paddusw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddusw:
 ; BTVER2:       # BB#0:
@@ -2970,14 +2970,14 @@ define <8 x i16> @test_paddw(<8 x i16> %
 ; SANDY-LABEL: test_paddw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_paddw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_paddw:
 ; BTVER2:       # BB#0:
@@ -3015,16 +3015,16 @@ define <2 x i64> @test_pand(<2 x i64> %a
 ; SANDY-LABEL: test_pand:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pand:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpand (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pand:
 ; BTVER2:       # BB#0:
@@ -3070,16 +3070,16 @@ define <2 x i64> @test_pandn(<2 x i64> %
 ; SANDY-LABEL: test_pandn:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50]
+; SANDY-NEXT:    vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
 ; SANDY-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pandn:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT:    vpandn (%rdi), %xmm0, %xmm1 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pandn:
 ; BTVER2:       # BB#0:
@@ -3122,14 +3122,14 @@ define <16 x i8> @test_pavgb(<16 x i8> %
 ; SANDY-LABEL: test_pavgb:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pavgb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pavgb:
 ; BTVER2:       # BB#0:
@@ -3169,14 +3169,14 @@ define <8 x i16> @test_pavgw(<8 x i16> %
 ; SANDY-LABEL: test_pavgw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pavgw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pavgw:
 ; BTVER2:       # BB#0:
@@ -3217,16 +3217,16 @@ define <16 x i8> @test_pcmpeqb(<16 x i8>
 ; SANDY-LABEL: test_pcmpeqb:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pcmpeqb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqb:
 ; BTVER2:       # BB#0:
@@ -3269,16 +3269,16 @@ define <4 x i32> @test_pcmpeqd(<4 x i32>
 ; SANDY-LABEL: test_pcmpeqd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pcmpeqd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqd:
 ; BTVER2:       # BB#0:
@@ -3321,16 +3321,16 @@ define <8 x i16> @test_pcmpeqw(<8 x i16>
 ; SANDY-LABEL: test_pcmpeqw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pcmpeqw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqw:
 ; BTVER2:       # BB#0:
@@ -3374,16 +3374,16 @@ define <16 x i8> @test_pcmpgtb(<16 x i8>
 ; SANDY-LABEL: test_pcmpgtb:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pcmpgtb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtb:
 ; BTVER2:       # BB#0:
@@ -3427,16 +3427,16 @@ define <4 x i32> @test_pcmpgtd(<4 x i32>
 ; SANDY-LABEL: test_pcmpgtd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pcmpgtd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtd:
 ; BTVER2:       # BB#0:
@@ -3480,16 +3480,16 @@ define <8 x i16> @test_pcmpgtw(<8 x i16>
 ; SANDY-LABEL: test_pcmpgtw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pcmpgtw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtw:
 ; BTVER2:       # BB#0:
@@ -3526,15 +3526,15 @@ define i16 @test_pextrw(<8 x i16> %a0) {
 ;
 ; SANDY-LABEL: test_pextrw:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpextrw $6, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT:    vpextrw $6, %xmm0, %eax # sched: [3:1.00]
 ; SANDY-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pextrw:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpextrw $6, %xmm0, %eax # sched: [1:1.00]
+; HASWELL-NEXT:    vpextrw $6, %xmm0, %eax # sched: [2:1.00]
 ; HASWELL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pextrw:
 ; BTVER2:       # BB#0:
@@ -3570,15 +3570,15 @@ define <8 x i16> @test_pinsrw(<8 x i16>
 ;
 ; SANDY-LABEL: test_pinsrw:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pinsrw:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pinsrw:
 ; BTVER2:       # BB#0:
@@ -3620,15 +3620,15 @@ define <4 x i32> @test_pmaddwd(<8 x i16>
 ;
 ; SANDY-LABEL: test_pmaddwd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmaddwd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmaddwd:
 ; BTVER2:       # BB#0:
@@ -3669,14 +3669,14 @@ define <8 x i16> @test_pmaxsw(<8 x i16>
 ; SANDY-LABEL: test_pmaxsw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmaxsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxsw:
 ; BTVER2:       # BB#0:
@@ -3716,14 +3716,14 @@ define <16 x i8> @test_pmaxub(<16 x i8>
 ; SANDY-LABEL: test_pmaxub:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmaxub:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxub:
 ; BTVER2:       # BB#0:
@@ -3763,14 +3763,14 @@ define <8 x i16> @test_pminsw(<8 x i16>
 ; SANDY-LABEL: test_pminsw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pminsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pminsw:
 ; BTVER2:       # BB#0:
@@ -3810,14 +3810,14 @@ define <16 x i8> @test_pminub(<16 x i8>
 ; SANDY-LABEL: test_pminub:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pminub:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pminub:
 ; BTVER2:       # BB#0:
@@ -3851,13 +3851,13 @@ define i32 @test_pmovmskb(<16 x i8> %a0)
 ;
 ; SANDY-LABEL: test_pmovmskb:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpmovmskb %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpmovmskb %xmm0, %eax # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovmskb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmovmskb %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmovmskb:
 ; BTVER2:       # BB#0:
@@ -3891,13 +3891,13 @@ define <8 x i16> @test_pmulhuw(<8 x i16>
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmulhuw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmulhuw:
 ; BTVER2:       # BB#0:
@@ -3932,15 +3932,15 @@ define <8 x i16> @test_pmulhw(<8 x i16>
 ;
 ; SANDY-LABEL: test_pmulhw:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmulhw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmulhw:
 ; BTVER2:       # BB#0:
@@ -3975,15 +3975,15 @@ define <8 x i16> @test_pmullw(<8 x i16>
 ;
 ; SANDY-LABEL: test_pmullw:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT:    vpmullw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmullw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmullw:
 ; BTVER2:       # BB#0:
@@ -4027,13 +4027,13 @@ define <2 x i64> @test_pmuludq(<4 x i32>
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmuludq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pmuludq:
 ; BTVER2:       # BB#0:
@@ -4073,16 +4073,16 @@ define <2 x i64> @test_por(<2 x i64> %a0
 ; SANDY-LABEL: test_por:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_por:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpor (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_por:
 ; BTVER2:       # BB#0:
@@ -4126,15 +4126,15 @@ define <2 x i64> @test_psadbw(<16 x i8>
 ;
 ; SANDY-LABEL: test_psadbw:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psadbw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psadbw:
 ; BTVER2:       # BB#0:
@@ -4176,16 +4176,16 @@ define <4 x i32> @test_pshufd(<4 x i32>
 ; SANDY-LABEL: test_pshufd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50]
-; SANDY-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:0.50]
+; SANDY-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:0.50]
 ; SANDY-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pshufd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00]
-; HASWELL-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; HASWELL-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pshufd:
 ; BTVER2:       # BB#0:
@@ -4227,16 +4227,16 @@ define <8 x i16> @test_pshufhw(<8 x i16>
 ; SANDY-LABEL: test_pshufhw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
-; SANDY-NEXT:    vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:0.50]
+; SANDY-NEXT:    vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
 ; SANDY-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pshufhw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
-; HASWELL-NEXT:    vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:1.00]
+; HASWELL-NEXT:    vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pshufhw:
 ; BTVER2:       # BB#0:
@@ -4278,16 +4278,16 @@ define <8 x i16> @test_pshuflw(<8 x i16>
 ; SANDY-LABEL: test_pshuflw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
-; SANDY-NEXT:    vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:0.50]
+; SANDY-NEXT:    vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
 ; SANDY-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pshuflw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00]
-; HASWELL-NEXT:    vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:1.00]
+; HASWELL-NEXT:    vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pshuflw:
 ; BTVER2:       # BB#0:
@@ -4328,15 +4328,15 @@ define <4 x i32> @test_pslld(<4 x i32> %
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; SANDY-NEXT:    vpslld (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pslld:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpslld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpslld %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT:    vpslld (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pslld:
 ; BTVER2:       # BB#0:
@@ -4378,12 +4378,12 @@ define <4 x i32> @test_pslldq(<4 x i32>
 ; SANDY-LABEL: test_pslldq:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pslldq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pslldq:
 ; BTVER2:       # BB#0:
@@ -4419,15 +4419,15 @@ define <2 x i64> @test_psllq(<2 x i64> %
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; SANDY-NEXT:    vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psllq:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsllq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT:    vpsllq (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psllq:
 ; BTVER2:       # BB#0:
@@ -4470,15 +4470,15 @@ define <8 x i16> @test_psllw(<8 x i16> %
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; SANDY-NEXT:    vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psllw:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT:    vpsllw (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psllw:
 ; BTVER2:       # BB#0:
@@ -4519,17 +4519,17 @@ define <4 x i32> @test_psrad(<4 x i32> %
 ;
 ; SANDY-LABEL: test_psrad:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psrad:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psrad:
 ; BTVER2:       # BB#0:
@@ -4570,17 +4570,17 @@ define <8 x i16> @test_psraw(<8 x i16> %
 ;
 ; SANDY-LABEL: test_psraw:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psraw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psraw:
 ; BTVER2:       # BB#0:
@@ -4621,17 +4621,17 @@ define <4 x i32> @test_psrld(<4 x i32> %
 ;
 ; SANDY-LABEL: test_psrld:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psrld:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psrld:
 ; BTVER2:       # BB#0:
@@ -4673,12 +4673,12 @@ define <4 x i32> @test_psrldq(<4 x i32>
 ; SANDY-LABEL: test_psrldq:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psrldq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psrldq:
 ; BTVER2:       # BB#0:
@@ -4712,17 +4712,17 @@ define <2 x i64> @test_psrlq(<2 x i64> %
 ;
 ; SANDY-LABEL: test_psrlq:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psrlq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psrlq:
 ; BTVER2:       # BB#0:
@@ -4763,17 +4763,17 @@ define <8 x i16> @test_psrlw(<8 x i16> %
 ;
 ; SANDY-LABEL: test_psrlw:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psrlw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
 ; HASWELL-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psrlw:
 ; BTVER2:       # BB#0:
@@ -4816,14 +4816,14 @@ define <16 x i8> @test_psubb(<16 x i8> %
 ; SANDY-LABEL: test_psubb:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psubb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubb:
 ; BTVER2:       # BB#0:
@@ -4862,14 +4862,14 @@ define <4 x i32> @test_psubd(<4 x i32> %
 ; SANDY-LABEL: test_psubd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psubd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubd:
 ; BTVER2:       # BB#0:
@@ -4904,14 +4904,14 @@ define <2 x i64> @test_psubq(<2 x i64> %
 ; SANDY-LABEL: test_psubq:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psubq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubq:
 ; BTVER2:       # BB#0:
@@ -4950,14 +4950,14 @@ define <16 x i8> @test_psubsb(<16 x i8>
 ; SANDY-LABEL: test_psubsb:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psubsb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubsb:
 ; BTVER2:       # BB#0:
@@ -4997,14 +4997,14 @@ define <8 x i16> @test_psubsw(<8 x i16>
 ; SANDY-LABEL: test_psubsw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psubsw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubsw:
 ; BTVER2:       # BB#0:
@@ -5044,14 +5044,14 @@ define <16 x i8> @test_psubusb(<16 x i8>
 ; SANDY-LABEL: test_psubusb:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psubusb:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubusb:
 ; BTVER2:       # BB#0:
@@ -5091,14 +5091,14 @@ define <8 x i16> @test_psubusw(<8 x i16>
 ; SANDY-LABEL: test_psubusw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psubusw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubusw:
 ; BTVER2:       # BB#0:
@@ -5138,14 +5138,14 @@ define <8 x i16> @test_psubw(<8 x i16> %
 ; SANDY-LABEL: test_psubw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psubw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_psubw:
 ; BTVER2:       # BB#0:
@@ -5184,14 +5184,14 @@ define <16 x i8> @test_punpckhbw(<16 x i
 ; SANDY-LABEL: test_punpckhbw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
-; SANDY-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpckhbw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhbw:
 ; BTVER2:       # BB#0:
@@ -5231,16 +5231,16 @@ define <4 x i32> @test_punpckhdq(<4 x i3
 ; SANDY-LABEL: test_punpckhdq:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
-; SANDY-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:0.50]
+; SANDY-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
 ; SANDY-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpckhdq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:1.00]
+; HASWELL-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhdq:
 ; BTVER2:       # BB#0:
@@ -5280,16 +5280,16 @@ define <2 x i64> @test_punpckhqdq(<2 x i
 ; SANDY-LABEL: test_punpckhqdq:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
-; SANDY-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:0.50]
+; SANDY-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
 ; SANDY-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpckhqdq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhqdq:
 ; BTVER2:       # BB#0:
@@ -5330,14 +5330,14 @@ define <8 x i16> @test_punpckhwd(<8 x i1
 ; SANDY-LABEL: test_punpckhwd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpckhwd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhwd:
 ; BTVER2:       # BB#0:
@@ -5376,14 +5376,14 @@ define <16 x i8> @test_punpcklbw(<16 x i
 ; SANDY-LABEL: test_punpcklbw:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpcklbw:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpcklbw:
 ; BTVER2:       # BB#0:
@@ -5423,16 +5423,16 @@ define <4 x i32> @test_punpckldq(<4 x i3
 ; SANDY-LABEL: test_punpckldq:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
-; SANDY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:0.50]
+; SANDY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
 ; SANDY-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpckldq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpckldq:
 ; BTVER2:       # BB#0:
@@ -5472,16 +5472,16 @@ define <2 x i64> @test_punpcklqdq(<2 x i
 ; SANDY-LABEL: test_punpcklqdq:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
-; SANDY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:0.50]
+; SANDY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
 ; SANDY-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpcklqdq:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpcklqdq:
 ; BTVER2:       # BB#0:
@@ -5522,14 +5522,14 @@ define <8 x i16> @test_punpcklwd(<8 x i1
 ; SANDY-LABEL: test_punpcklwd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
-; SANDY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_punpcklwd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_punpcklwd:
 ; BTVER2:       # BB#0:
@@ -5567,16 +5567,16 @@ define <2 x i64> @test_pxor(<2 x i64> %a
 ; SANDY-LABEL: test_pxor:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SANDY-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pxor:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT:    vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vpxor (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_pxor:
 ; BTVER2:       # BB#0:
@@ -5616,16 +5616,16 @@ define <2 x double> @test_shufpd(<2 x do
 ; SANDY-LABEL: test_shufpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; SANDY-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00]
+; SANDY-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_shufpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00]
+; HASWELL-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_shufpd:
 ; BTVER2:       # BB#0:
@@ -5665,17 +5665,17 @@ define <2 x double> @test_sqrtpd(<2 x do
 ;
 ; SANDY-LABEL: test_sqrtpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vsqrtpd %xmm0, %xmm0 # sched: [15:1.00]
-; SANDY-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [19:1.00]
+; SANDY-NEXT:    vsqrtpd %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [28:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_sqrtpd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtpd %xmm0, %xmm0 # sched: [15:1.00]
-; HASWELL-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT:    vsqrtpd %xmm0, %xmm0 # sched: [21:1.00]
+; HASWELL-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [21:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtpd:
 ; BTVER2:       # BB#0:
@@ -5720,19 +5720,19 @@ define <2 x double> @test_sqrtsd(<2 x do
 ;
 ; SANDY-LABEL: test_sqrtsd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; SANDY-NEXT:    vmovapd (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; SANDY-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; SANDY-NEXT:    vmovapd (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_sqrtsd:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; HASWELL-NEXT:    vmovapd (%rdi), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; HASWELL-NEXT:    vmovapd (%rdi), %xmm1 # sched: [?:5.000000e-01]
+; HASWELL-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtsd:
 ; BTVER2:       # BB#0:
@@ -5771,14 +5771,14 @@ define <2 x double> @test_subpd(<2 x dou
 ; SANDY-LABEL: test_subpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_subpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subpd:
 ; BTVER2:       # BB#0:
@@ -5813,14 +5813,14 @@ define double @test_subsd(double %a0, do
 ; SANDY-LABEL: test_subsd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_subsd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_subsd:
 ; BTVER2:       # BB#0:
@@ -5879,30 +5879,30 @@ define i32 @test_ucomisd(<2 x double> %a
 ; SANDY-LABEL: test_ucomisd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vucomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    setnp %al # sched: [1:0.33]
-; SANDY-NEXT:    sete %cl # sched: [1:0.33]
+; SANDY-NEXT:    setnp %al # sched: [1:1.00]
+; SANDY-NEXT:    sete %cl # sched: [1:1.00]
 ; SANDY-NEXT:    andb %al, %cl # sched: [1:0.33]
 ; SANDY-NEXT:    vucomisd (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT:    setnp %al # sched: [1:0.33]
-; SANDY-NEXT:    sete %dl # sched: [1:0.33]
+; SANDY-NEXT:    setnp %al # sched: [1:1.00]
+; SANDY-NEXT:    sete %dl # sched: [1:1.00]
 ; SANDY-NEXT:    andb %al, %dl # sched: [1:0.33]
 ; SANDY-NEXT:    orb %cl, %dl # sched: [1:0.33]
 ; SANDY-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_ucomisd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vucomisd %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    setnp %al # sched: [1:0.50]
-; HASWELL-NEXT:    sete %cl # sched: [1:0.50]
+; HASWELL-NEXT:    setnp %al # sched: [1:1.00]
+; HASWELL-NEXT:    sete %cl # sched: [1:1.00]
 ; HASWELL-NEXT:    andb %al, %cl # sched: [1:0.25]
 ; HASWELL-NEXT:    vucomisd (%rdi), %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT:    setnp %al # sched: [1:0.50]
-; HASWELL-NEXT:    sete %dl # sched: [1:0.50]
+; HASWELL-NEXT:    setnp %al # sched: [1:1.00]
+; HASWELL-NEXT:    sete %dl # sched: [1:1.00]
 ; HASWELL-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; HASWELL-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_ucomisd:
 ; BTVER2:       # BB#0:
@@ -5950,16 +5950,16 @@ define <2 x double> @test_unpckhpd(<2 x
 ; SANDY-LABEL: test_unpckhpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; SANDY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; SANDY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_unpckhpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhpd:
 ; BTVER2:       # BB#0:
@@ -6005,16 +6005,16 @@ define <2 x double> @test_unpcklpd(<2 x
 ; SANDY-LABEL: test_unpcklpd:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SANDY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_unpcklpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklpd:
 ; BTVER2:       # BB#0:
@@ -6053,17 +6053,17 @@ define <2 x double> @test_xorpd(<2 x dou
 ;
 ; SANDY-LABEL: test_xorpd:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SANDY-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [5:1.00]
+; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_xorpd:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    retq # sched: [1:1.00]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; BTVER2-LABEL: test_xorpd:
 ; BTVER2:       # BB#0:

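A note for anyone reading the updated CHECK lines above: the "sched: [A:B]" comments encode the estimated static latency A (in cycles) and the reciprocal throughput B, both taken from the per-CPU scheduling model. As a rough illustrative sketch only (not code from this patch; the SchedWrite name and port are stand-ins), a TableGen model entry that supplies a latency, a uop count, and the port(s) used for one instruction class looks like this:

  // Illustrative sketch: bind one SchedWrite class to the port(s) it occupies
  // and override the default latency and micro-op count for this CPU model.
  def : WriteRes<WriteVecShift, [HWPort0]> {
    let Latency     = 2;  // static latency in cycles
    let NumMicroOps = 1;  // number of uops the instruction decodes into
  }

The regenerated "sched:" annotations in the tests are checked against this kind of per-instruction data.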


