[llvm] r353043 - [AsmPrinter] Remove hidden flag -print-schedule.

Andrea Di Biagio via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 4 04:51:26 PST 2019


Modified: llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/recip-fastmath.ll?rev=353043&r1=353042&r2=353043&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/recip-fastmath.ll (original)
+++ llvm/trunk/test/CodeGen/X86/recip-fastmath.ll Mon Feb  4 04:51:26 2019
@@ -2,13 +2,13 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
 
 ; If the target's divss/divps instructions are substantially
 ; slower than rcpss/rcpps with a Newton-Raphson refinement,
@@ -26,59 +26,11 @@ define float @f32_no_estimate(float %x)
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-RECIP-LABEL: f32_no_estimate:
-; AVX-RECIP:       # %bb.0:
-; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    retq
-;
-; FMA-RECIP-LABEL: f32_no_estimate:
-; FMA-RECIP:       # %bb.0:
-; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; FMA-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0
-; FMA-RECIP-NEXT:    retq
-;
-; BDVER2-LABEL: f32_no_estimate:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
-; BDVER2-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [9:9.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: f32_no_estimate:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
-; BTVER2-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; SANDY-LABEL: f32_no_estimate:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
-; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: f32_no_estimate:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
-; HASWELL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; HASWELL-NO-FMA-LABEL: f32_no_estimate:
-; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HASWELL-NO-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    retq
-;
-; KNL-LABEL: f32_no_estimate:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
-; KNL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: f32_no_estimate:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
-; SKX-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [11:3.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX-LABEL: f32_no_estimate:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -114,37 +66,37 @@ define float @f32_one_step(float %x) #1
 ;
 ; BDVER2-LABEL: f32_one_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
-; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_one_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_one_step:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
-; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_one_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
-; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
+; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step:
 ; HASWELL-NO-FMA:       # %bb.0:
@@ -156,19 +108,12 @@ define float @f32_one_step(float %x) #1
 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
-; KNL-LABEL: f32_one_step:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
-; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: f32_one_step:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
-; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX512-LABEL: f32_one_step:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
+; AVX512-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; AVX512-NEXT:    retq
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -218,52 +163,52 @@ define float @f32_two_step(float %x) #2
 ;
 ; BDVER2-LABEL: f32_two_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddss %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm3, %xmm1, %xmm1
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_two_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_two_step:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
-; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_two_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
-; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
+; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
+; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_two_step:
 ; HASWELL-NO-FMA:       # %bb.0:
@@ -279,27 +224,16 @@ define float @f32_two_step(float %x) #2
 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
-; KNL-LABEL: f32_two_step:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
-; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
-; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: f32_two_step:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
-; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX512-LABEL: f32_two_step:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovaps %xmm1, %xmm3
+; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
+; AVX512-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
+; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; AVX512-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
+; AVX512-NEXT:    retq
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -326,27 +260,27 @@ define <4 x float> @v4f32_no_estimate(<4
 ;
 ; BDVER2-LABEL: v4f32_no_estimate:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [9:9.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_no_estimate:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_no_estimate:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_no_estimate:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
 ; HASWELL-NO-FMA:       # %bb.0:
@@ -354,17 +288,11 @@ define <4 x float> @v4f32_no_estimate(<4
 ; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
-; KNL-LABEL: v4f32_no_estimate:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; KNL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v4f32_no_estimate:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; SKX-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [11:3.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX512-LABEL: v4f32_no_estimate:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -400,38 +328,38 @@ define <4 x float> @v4f32_one_step(<4 x
 ;
 ; BDVER2-LABEL: v4f32_one_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
-; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_one_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_one_step:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_one_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step:
 ; HASWELL-NO-FMA:       # %bb.0:
@@ -445,18 +373,18 @@ define <4 x float> @v4f32_one_step(<4 x
 ;
 ; KNL-LABEL: v4f32_one_step:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
+; KNL-NEXT:    vrcpps %xmm0, %xmm1
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v4f32_one_step:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; SKX-NEXT:    vrcpps %xmm0, %xmm1
+; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
+; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; SKX-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -506,52 +434,52 @@ define <4 x float> @v4f32_two_step(<4 x
 ;
 ; BDVER2-LABEL: v4f32_two_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_two_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_two_step:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_two_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_two_step:
 ; HASWELL-NO-FMA:       # %bb.0:
@@ -567,27 +495,16 @@ define <4 x float> @v4f32_two_step(<4 x
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
-; KNL-LABEL: v4f32_two_step:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v4f32_two_step:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX512-LABEL: v4f32_two_step:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcpps %xmm0, %xmm1
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT:    vmovaps %xmm1, %xmm3
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
+; AVX512-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -617,27 +534,27 @@ define <8 x float> @v8f32_no_estimate(<8
 ;
 ; BDVER2-LABEL: v8f32_no_estimate:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [9:19.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_no_estimate:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [38:38.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_no_estimate:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [29:28.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_no_estimate:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:14.00]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
 ; HASWELL-NO-FMA:       # %bb.0:
@@ -645,17 +562,11 @@ define <8 x float> @v8f32_no_estimate(<8
 ; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
-; KNL-LABEL: v8f32_no_estimate:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; KNL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:14.00]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v8f32_no_estimate:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SKX-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [11:5.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX512-LABEL: v8f32_no_estimate:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -698,38 +609,38 @@ define <8 x float> @v8f32_one_step(<8 x
 ;
 ; BDVER2-LABEL: v8f32_one_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
-; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_one_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_one_step:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_one_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step:
 ; HASWELL-NO-FMA:       # %bb.0:
@@ -743,18 +654,18 @@ define <8 x float> @v8f32_one_step(<8 x
 ;
 ; KNL-LABEL: v8f32_one_step:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
+; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
+; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_one_step:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; SKX-NEXT:    vrcpps %ymm0, %ymm1
+; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
+; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
+; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -817,52 +728,52 @@ define <8 x float> @v8f32_two_step(<8 x
 ;
 ; BDVER2-LABEL: v8f32_two_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_two_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_two_step:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_two_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vmovaps %ymm1, %ymm3
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_two_step:
 ; HASWELL-NO-FMA:       # %bb.0:
@@ -878,27 +789,16 @@ define <8 x float> @v8f32_two_step(<8 x
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
-; KNL-LABEL: v8f32_two_step:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v8f32_two_step:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX512-LABEL: v8f32_two_step:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcpps %ymm0, %ymm1
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT:    vmovaps %ymm1, %ymm3
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
+; AVX512-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -936,31 +836,31 @@ define <16 x float> @v16f32_no_estimate(
 ;
 ; BDVER2-LABEL: v16f32_no_estimate:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [9:19.00]
-; BDVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [9:19.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; BDVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v16f32_no_estimate:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [38:38.00]
-; BTVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [38:38.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v16f32_no_estimate:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [29:28.00]
-; SANDY-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [29:28.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v16f32_no_estimate:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [21:14.00]
-; HASWELL-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [21:14.00]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; HASWELL-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v16f32_no_estimate:
 ; HASWELL-NO-FMA:       # %bb.0:
@@ -969,17 +869,11 @@ define <16 x float> @v16f32_no_estimate(
 ; HASWELL-NO-FMA-NEXT:    vdivps %ymm1, %ymm2, %ymm1
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
-; KNL-LABEL: v16f32_no_estimate:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [10:1.00]
-; KNL-NEXT:    vdivps %zmm0, %zmm1, %zmm0 # sched: [21:14.00]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v16f32_no_estimate:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [8:0.50]
-; SKX-NEXT:    vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX512-LABEL: v16f32_no_estimate:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <16 x float> %div
 }
@@ -1045,55 +939,55 @@ define <16 x float> @v16f32_one_step(<16
 ;
 ; BDVER2-LABEL: v16f32_one_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm4 # sched: [5:2.00]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm1, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm4, %ymm1, %ymm4, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm4
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm1, %ymm1
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0
+; BDVER2-NEXT:    vfmaddps %ymm4, %ymm1, %ymm4, %ymm1
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v16f32_one_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vrcpps %ymm1, %ymm4 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm4, %ymm1, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm4, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vaddps %ymm1, %ymm4, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
+; BTVER2-NEXT:    vrcpps %ymm1, %ymm4
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; BTVER2-NEXT:    vmulps %ymm4, %ymm1, %ymm1
+; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1
+; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vmulps %ymm1, %ymm4, %ymm1
+; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vaddps %ymm1, %ymm4, %ymm1
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v16f32_one_step:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm0, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vrcpps %ymm1, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v16f32_one_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vrcpps %ymm1, %ymm4 # sched: [11:2.00]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vrcpps %ymm1, %ymm4
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v16f32_one_step:
 ; HASWELL-NO-FMA:       # %bb.0:
@@ -1110,19 +1004,12 @@ define <16 x float> @v16f32_one_step(<16
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
-; KNL-LABEL: v16f32_one_step:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v16f32_one_step:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX512-LABEL: v16f32_one_step:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcp14ps %zmm0, %zmm1
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1
+; AVX512-NEXT:    retq
   %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <16 x float> %div
 }
@@ -1226,81 +1113,81 @@ define <16 x float> @v16f32_two_step(<16
 ;
 ; BDVER2-LABEL: v16f32_two_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v16f32_two_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3
+; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3
+; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3
+; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm4, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vrcpps %ymm1, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3
+; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3
+; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3
+; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; BTVER2-NEXT:    vsubps %ymm1, %ymm4, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v16f32_two_step:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm0, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3
+; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3
+; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; SANDY-NEXT:    vsubps %ymm0, %ymm4, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vrcpps %ymm1, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3
+; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3
+; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3
+; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; SANDY-NEXT:    vsubps %ymm1, %ymm4, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v16f32_two_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
-; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vmovaps %ymm2, %ymm4
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
+; HASWELL-NEXT:    vrcpps %ymm1, %ymm2
+; HASWELL-NEXT:    vmovaps %ymm2, %ymm4
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v16f32_two_step:
 ; HASWELL-NO-FMA:       # %bb.0:
@@ -1325,27 +1212,16 @@ define <16 x float> @v16f32_two_step(<16
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
-; KNL-LABEL: v16f32_two_step:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [10:1.00]
-; KNL-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v16f32_two_step:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [8:0.50]
-; SKX-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:0.33]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX512-LABEL: v16f32_two_step:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcp14ps %zmm0, %zmm1
+; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT:    vmovaps %zmm1, %zmm3
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3
+; AVX512-NEXT:    retq
   %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <16 x float> %div
 }

Modified: llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll?rev=353043&r1=353042&r2=353043&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll Mon Feb  4 04:51:26 2019
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -print-schedule       | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -print-schedule  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule         | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule         | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2      | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx       | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl         | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx         | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
 
 ; It's the extra tests coverage for recip as discussed on D26855.
 
@@ -19,59 +19,11 @@ define float @f32_no_step_2(float %x) #3
 ; SSE-NEXT:    mulss {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-RECIP-LABEL: f32_no_step_2:
-; AVX-RECIP:       # %bb.0:
-; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
-; AVX-RECIP-NEXT:    retq
-;
-; FMA-RECIP-LABEL: f32_no_step_2:
-; FMA-RECIP:       # %bb.0:
-; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
-; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
-; FMA-RECIP-NEXT:    retq
-;
-; BDVER2-LABEL: f32_no_step_2:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: f32_no_step_2:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; SANDY-LABEL: f32_no_step_2:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: f32_no_step_2:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; HASWELL-NO-FMA-LABEL: f32_no_step_2:
-; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: f32_no_step_2:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: f32_no_step_2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
-; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX-LABEL: f32_no_step_2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
   %div = fdiv fast float 1234.0, %x
   ret float %div
 }
@@ -110,68 +62,60 @@ define float @f32_one_step_2(float %x) #
 ;
 ; BDVER2-LABEL: f32_one_step_2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
-; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_one_step_2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_one_step_2:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
-; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_one_step_2:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
-; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
+; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: f32_one_step_2:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
-; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: f32_one_step_2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
-; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
-; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: f32_one_step_2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
+; AVX512-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast float 3456.0, %x
   ret float %div
 }
@@ -213,75 +157,66 @@ define float @f32_one_step_2_divs(float
 ;
 ; BDVER2-LABEL: f32_one_step_2_divs:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
-; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; BDVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_one_step_2_divs:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
-; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_one_step_2_divs:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
-; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
-; HASWELL-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
+; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; HASWELL-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: f32_one_step_2_divs:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
-; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
-; KNL-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: f32_one_step_2_divs:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
-; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
-; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
-; SKX-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: f32_one_step_2_divs:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
+; AVX512-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast float 3456.0, %x
   %div2 = fdiv fast float %div, %x
   ret float %div2
@@ -335,95 +270,83 @@ define float @f32_two_step_2(float %x) #
 ;
 ; BDVER2-LABEL: f32_two_step_2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddss %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm3, %xmm1, %xmm1
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_two_step_2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_two_step_2:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
-; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_two_step_2:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
-; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
+; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
+; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_two_step_2:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: f32_two_step_2:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
-; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
-; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
-; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: f32_two_step_2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
-; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
-; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: f32_two_step_2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovaps %xmm1, %xmm3
+; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
+; AVX512-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
+; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; AVX512-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast float 6789.0, %x
   ret float %div
 }
@@ -462,70 +385,70 @@ define <4 x float> @v4f32_one_step2(<4 x
 ;
 ; BDVER2-LABEL: v4f32_one_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
-; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_one_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_one_step2:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_one_step2:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v4f32_one_step2:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
+; KNL-NEXT:    vrcpps %xmm0, %xmm1
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v4f32_one_step2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; SKX-NEXT:    vrcpps %xmm0, %xmm1
+; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
+; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SKX-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
 }
@@ -567,77 +490,77 @@ define <4 x float> @v4f32_one_step_2_div
 ;
 ; BDVER2-LABEL: v4f32_one_step_2_divs:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
-; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; BDVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_one_step_2_divs:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_one_step_2_divs:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
-; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v4f32_one_step_2_divs:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
-; KNL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
+; KNL-NEXT:    vrcpps %xmm0, %xmm1
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; KNL-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v4f32_one_step_2_divs:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
-; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; SKX-NEXT:    vrcpps %xmm0, %xmm1
+; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
+; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SKX-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   %div2 = fdiv fast <4 x float> %div, %x
   ret <4 x float> %div2
@@ -691,95 +614,83 @@ define <4 x float> @v4f32_two_step2(<4 x
 ;
 ; BDVER2-LABEL: v4f32_two_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_two_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_two_step2:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_two_step2:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: v4f32_two_step2:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v4f32_two_step2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
-; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: v4f32_two_step2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcpps %xmm0, %xmm1
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT:    vmovaps %xmm1, %xmm3
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
+; AVX512-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
 }
@@ -826,70 +737,70 @@ define <8 x float> @v8f32_one_step2(<8 x
 ;
 ; BDVER2-LABEL: v8f32_one_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
-; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_one_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_one_step2:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_one_step2:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_one_step2:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
+; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_one_step2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; SKX-NEXT:    vrcpps %ymm0, %ymm1
+; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
+; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
@@ -940,77 +851,77 @@ define <8 x float> @v8f32_one_step_2_div
 ;
 ; BDVER2-LABEL: v8f32_one_step_2_divs:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
-; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [10:2.00]
-; BDVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; BDVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_one_step_2_divs:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_one_step_2_divs:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
-; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_one_step_2_divs:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
-; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
+; KNL-NEXT:    vrcpps %ymm0, %ymm1
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
+; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_one_step_2_divs:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
-; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; SKX-NEXT:    vrcpps %ymm0, %ymm1
+; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
+; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   %div2 = fdiv fast <8 x float> %div, %x
   ret <8 x float> %div2
@@ -1078,95 +989,83 @@ define <8 x float> @v8f32_two_step2(<8 x
 ;
 ; BDVER2-LABEL: v8f32_two_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_two_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_two_step2:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_two_step2:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vmovaps %ymm1, %ymm3
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: v8f32_two_step2:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v8f32_two_step2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: v8f32_two_step2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcpps %ymm0, %ymm1
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT:    vmovaps %ymm1, %ymm3
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
+; AVX512-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
@@ -1178,50 +1077,10 @@ define <8 x float> @v8f32_no_step(<8 x f
 ; SSE-NEXT:    rcpps %xmm1, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-RECIP-LABEL: v8f32_no_step:
-; AVX-RECIP:       # %bb.0:
-; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
-; AVX-RECIP-NEXT:    retq
-;
-; FMA-RECIP-LABEL: v8f32_no_step:
-; FMA-RECIP:       # %bb.0:
-; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
-; FMA-RECIP-NEXT:    retq
-;
-; BDVER2-LABEL: v8f32_no_step:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: v8f32_no_step:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; SANDY-LABEL: v8f32_no_step:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: v8f32_no_step:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; HASWELL-NO-FMA-LABEL: v8f32_no_step:
-; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: v8f32_no_step:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v8f32_no_step:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %ymm0, %ymm0 # sched: [4:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX-LABEL: v8f32_no_step:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpps %ymm0, %ymm0
+; AVX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -1235,59 +1094,11 @@ define <8 x float> @v8f32_no_step2(<8 x
 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-RECIP-LABEL: v8f32_no_step2:
-; AVX-RECIP:       # %bb.0:
-; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
-; AVX-RECIP-NEXT:    retq
-;
-; FMA-RECIP-LABEL: v8f32_no_step2:
-; FMA-RECIP:       # %bb.0:
-; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
-; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
-; FMA-RECIP-NEXT:    retq
-;
-; BDVER2-LABEL: v8f32_no_step2:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: v8f32_no_step2:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; SANDY-LABEL: v8f32_no_step2:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: v8f32_no_step2:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
-; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: v8f32_no_step2:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v8f32_no_step2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcpps %ymm0, %ymm0 # sched: [4:1.00]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; AVX-LABEL: v8f32_no_step2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpps %ymm0, %ymm0
+; AVX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
@@ -1361,96 +1172,88 @@ define <16 x float> @v16f32_one_step2(<1
 ;
 ; BDVER2-LABEL: v16f32_one_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm4 # sched: [5:2.00]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm4, %ymm0, %ymm4, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm4
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm0, %ymm0
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1
+; BDVER2-NEXT:    vfmaddps %ymm4, %ymm0, %ymm4, %ymm0
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v16f32_one_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm4 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm4, %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm4, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    vaddps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %ymm1, %ymm2
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm4
+; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; BTVER2-NEXT:    vmulps %ymm4, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1
+; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT:    vmulps %ymm0, %ymm4, %ymm0
+; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT:    vaddps %ymm0, %ymm4, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v16f32_one_step2:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm1, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT:    vrcpps %ymm0, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v16f32_one_step2:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm4 # sched: [11:2.00]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm1, %ymm2
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm4
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v16f32_one_step2:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: v16f32_one_step2:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v16f32_one_step2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: v16f32_one_step2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcp14ps %zmm0, %zmm1
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1
+; AVX512-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
   ret <16 x float> %div
 }
@@ -1532,108 +1335,99 @@ define <16 x float> @v16f32_one_step_2_d
 ;
 ; BDVER2-LABEL: v16f32_one_step_2_divs:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [10:2.00]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [10:2.00]
-; BDVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:2.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2
+; BDVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0
+; BDVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v16f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [7:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [7:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vrcpps %ymm1, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3
+; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2
+; BTVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0
+; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v16f32_one_step_2_divs:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vrcpps %ymm1, %ymm4 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm4, %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm4, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:1.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm0, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; SANDY-NEXT:    vrcpps %ymm1, %ymm4
+; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vmulps %ymm4, %ymm1, %ymm1
+; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm4, %ymm1
+; SANDY-NEXT:    vaddps %ymm1, %ymm4, %ymm1
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3
+; SANDY-NEXT:    vmulps %ymm0, %ymm3, %ymm0
+; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v16f32_one_step_2_divs:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50]
-; HASWELL-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
+; HASWELL-NEXT:    vrcpps %ymm1, %ymm2
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3
+; HASWELL-NEXT:    vmulps %ymm0, %ymm3, %ymm0
+; HASWELL-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v16f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm4 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm4, %ymm1, %ymm1 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm4, %ymm1 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: v16f32_one_step_2_divs:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [12:0.50]
-; KNL-NEXT:    vmulps %zmm0, %zmm1, %zmm0 # sched: [5:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v16f32_one_step_2_divs:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [11:0.50]
-; SKX-NEXT:    vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm4
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm4, %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm4, %ymm1
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm4, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm3, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: v16f32_one_step_2_divs:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcp14ps %zmm0, %zmm1
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1
+; AVX512-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm1
+; AVX512-NEXT:    vmulps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
   %div2 = fdiv fast <16 x float> %div, %x
   ret <16 x float> %div2
@@ -1745,138 +1539,126 @@ define <16 x float> @v16f32_two_step2(<1
 ;
 ; BDVER2-LABEL: v16f32_two_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
-; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v16f32_two_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
-; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT:    vrcpps %ymm1, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3
+; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3
+; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3
+; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; BTVER2-NEXT:    vsubps %ymm1, %ymm4, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3
+; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3
+; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3
+; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm4, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v16f32_two_step2:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm1, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3
+; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3
+; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; SANDY-NEXT:    vsubps %ymm1, %ymm4, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT:    vrcpps %ymm0, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3
+; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3
+; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3
+; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; SANDY-NEXT:    vsubps %ymm0, %ymm4, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v16f32_two_step2:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50]
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
-; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm1, %ymm2
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vmovaps %ymm2, %ymm4
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NEXT:    vmovaps %ymm2, %ymm4
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
+; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v16f32_two_step2:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: v16f32_two_step2:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [10:1.00]
-; KNL-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:1.00]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
-; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
-; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v16f32_two_step2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [8:0.50]
-; SKX-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:0.33]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm4, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm3
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm4, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: v16f32_two_step2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcp14ps %zmm0, %zmm1
+; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT:    vmovaps %zmm1, %zmm3
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2
+; AVX512-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3
+; AVX512-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
   ret <16 x float> %div
 }
@@ -1904,43 +1686,38 @@ define <16 x float> @v16f32_no_step(<16
 ;
 ; BDVER2-LABEL: v16f32_no_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [5:2.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm1
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v16f32_no_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm0
+; BTVER2-NEXT:    vrcpps %ymm1, %ymm1
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v16f32_no_step:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; SANDY-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm0, %ymm0
+; SANDY-NEXT:    vrcpps %ymm1, %ymm1
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v16f32_no_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
-; HASWELL-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0
+; HASWELL-NEXT:    vrcpps %ymm1, %ymm1
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v16f32_no_step:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: v16f32_no_step:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [11:2.00]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v16f32_no_step:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [4:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: v16f32_no_step:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcp14ps %zmm0, %zmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <16 x float> %div
 }
@@ -1976,55 +1753,49 @@ define <16 x float> @v16f32_no_step2(<16
 ;
 ; BDVER2-LABEL: v16f32_no_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [5:2.00]
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm1
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; BDVER2-NEXT:    retq
 ;
 ; BTVER2-LABEL: v16f32_no_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [2:2.00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
+; BTVER2-NEXT:    vrcpps %ymm1, %ymm1
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v16f32_no_step2:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
-; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
-; SANDY-NEXT:    retq # sched: [1:1.00]
+; SANDY-NEXT:    vrcpps %ymm1, %ymm1
+; SANDY-NEXT:    vrcpps %ymm0, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v16f32_no_step2:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NEXT:    vrcpps %ymm1, %ymm1
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v16f32_no_step2:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
-; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: v16f32_no_step2:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [11:2.00]
-; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: v16f32_no_step2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [4:2.00]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: v16f32_no_step2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcp14ps %zmm0, %zmm0
+; AVX512-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
   ret <16 x float> %div
 }

Removed: llvm/trunk/test/CodeGen/X86/rtm-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/rtm-schedule.ll?rev=353042&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/rtm-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/rtm-schedule.ll (removed)
@@ -1,62 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+rtm -mcpu=x86-64 -mattr=+rtm | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+rtm -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE --check-prefix=SKL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+rtm -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE --check-prefix=SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+rtm -mcpu=cannonlake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE --check-prefix=CNL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+rtm -mcpu=icelake-client | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE --check-prefix=ICL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+rtm -mcpu=icelake-server | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE --check-prefix=ICL
-
-define i32 @test_xbegin() nounwind uwtable {
-; GENERIC-LABEL: test_xbegin:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    xbegin .LBB0_2 # sched: [100:0.33]
-; GENERIC-NEXT:  # %bb.1:
-; GENERIC-NEXT:    movl $-1, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-; GENERIC-NEXT:  .LBB0_2:
-; GENERIC-NEXT:    # XABORT DEF # sched: [100:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKYLAKE-LABEL: test_xbegin:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    xbegin .LBB0_2 # sched: [100:0.25]
-; SKYLAKE-NEXT:  # %bb.1:
-; SKYLAKE-NEXT:    movl $-1, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-; SKYLAKE-NEXT:  .LBB0_2:
-; SKYLAKE-NEXT:    # XABORT DEF # sched: [100:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-  %1 = tail call i32 @llvm.x86.xbegin() nounwind
-  ret i32 %1
-}
-declare i32 @llvm.x86.xbegin() nounwind
-
-define void @test_xend() nounwind uwtable {
-; GENERIC-LABEL: test_xend:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    xend # sched: [100:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKYLAKE-LABEL: test_xend:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    xend # sched: [100:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-  tail call void @llvm.x86.xend() nounwind
-  ret void
-}
-declare void @llvm.x86.xend() nounwind
-
-define void @test_xabort() nounwind uwtable {
-; GENERIC-LABEL: test_xabort:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    xabort $2 # sched: [100:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKYLAKE-LABEL: test_xabort:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    xabort $2 # sched: [100:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-  tail call void @llvm.x86.xabort(i8 2)
-  ret void
-}
-declare void @llvm.x86.xabort(i8) nounwind

Removed: llvm/trunk/test/CodeGen/X86/schedule-x86-64-shld.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/schedule-x86-64-shld.ll?rev=353042&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/schedule-x86-64-shld.ll (original)
+++ llvm/trunk/test/CodeGen/X86/schedule-x86-64-shld.ll (removed)
@@ -1,471 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER12 --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER12 --check-prefix=BDVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-
-
-; uint64_t lshift10(uint64_t a, uint64_t b)
-; {
-;     return (a << 10) | (b >> 54);
-; }
-
-define i64 @lshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
-; GENERIC-LABEL: lshift10_optsize:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    shldq $10, %rsi, %rax # sched: [2:0.67]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: lshift10_optsize:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    shldq $10, %rsi, %rax # sched: [4:3.00]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: lshift10_optsize:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    shldq $10, %rsi, %rax # sched: [3:3.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %shl = shl i64 %a, 10
-  %shr = lshr i64 %b, 54
-  %or = or i64 %shr, %shl
-  ret i64 %or
-}
-
-define i64 @lshift10(i64 %a, i64 %b) nounwind readnone {
-; GENERIC-LABEL: lshift10:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    shldq $10, %rsi, %rax # sched: [2:0.67]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: lshift10:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    shlq $10, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    shrq $54, %rsi # sched: [1:0.50]
-; BDVER12-NEXT:    leaq (%rsi,%rdi), %rax # sched: [1:0.50]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: lshift10:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    shlq $10, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    shrq $54, %rsi # sched: [1:0.50]
-; BTVER2-NEXT:    leaq (%rsi,%rdi), %rax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %shl = shl i64 %a, 10
-  %shr = lshr i64 %b, 54
-  %or = or i64 %shr, %shl
-  ret i64 %or
-}
-
-; uint64_t rshift10(uint64_t a, uint64_t b)
-; {
-;     return (a >> 62) | (b << 2);
-; }
-
-; Should be done via shld
-define i64 @rshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
-; GENERIC-LABEL: rshift10_optsize:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    shrdq $62, %rsi, %rax # sched: [2:0.67]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: rshift10_optsize:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    shrdq $62, %rsi, %rax # sched: [4:3.00]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: rshift10_optsize:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    shrdq $62, %rsi, %rax # sched: [3:3.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %shl = lshr i64 %a, 62
-  %shr = shl i64 %b, 2
-  %or = or i64 %shr, %shl
-  ret i64 %or
-}
-
-; Should be done via lea (x,y,4),z
-define i64 @rshift10(i64 %a, i64 %b) nounwind readnone {
-; GENERIC-LABEL: rshift10:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    shrdq $62, %rsi, %rax # sched: [2:0.67]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: rshift10:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    shrq $62, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: rshift10:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    shrq $62, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    leaq (%rdi,%rsi,4), %rax # sched: [2:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %shl = lshr i64 %a, 62
-  %shr = shl i64 %b, 2
-  %or = or i64 %shr, %shl
-  ret i64 %or
-}
-
-;uint64_t lshift(uint64_t a, uint64_t b, uint64_t c)
-;{
-;    return (a << c) | (b >> (64-c));
-;}
-
-define i64 @lshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize {
-; GENERIC-LABEL: lshift_cl_optsize:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
-; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
-; GENERIC-NEXT:    shldq %cl, %rsi, %rax # sched: [4:1.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: lshift_cl_optsize:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
-; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shldq %cl, %rsi, %rax # sched: [4:4.00]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: lshift_cl_optsize:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
-; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BTVER2-NEXT:    shldq %cl, %rsi, %rax # sched: [4:4.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %shl = shl i64 %a, %c
-  %sub = sub nsw i64 64, %c
-  %shr = lshr i64 %b, %sub
-  %or = or i64 %shr, %shl
-  ret i64 %or
-}
-
-define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
-; GENERIC-LABEL: lshift_cl:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
-; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
-; GENERIC-NEXT:    shldq %cl, %rsi, %rax # sched: [4:1.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: lshift_cl:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
-; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    shlq %cl, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    negb %cl # sched: [1:0.50]
-; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shrq %cl, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: lshift_cl:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rsi, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    shlq %cl, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    negb %cl # sched: [1:0.50]
-; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BTVER2-NEXT:    shrq %cl, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %shl = shl i64 %a, %c
-  %sub = sub nsw i64 64, %c
-  %shr = lshr i64 %b, %sub
-  %or = or i64 %shr, %shl
-  ret i64 %or
-}
-
-
-;uint64_t rshift(uint64_t a, uint64_t b, int c)
-;{
-;    return (a >> c) | (b << (64-c));
-;}
-
-define i64 @rshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize {
-; GENERIC-LABEL: rshift_cl_optsize:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
-; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
-; GENERIC-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:1.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: rshift_cl_optsize:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
-; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:4.00]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: rshift_cl_optsize:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
-; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BTVER2-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:4.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %shr = lshr i64 %a, %c
-  %sub = sub nsw i64 64, %c
-  %shl = shl i64 %b, %sub
-  %or = or i64 %shr, %shl
-  ret i64 %or
-}
-
-define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
-; GENERIC-LABEL: rshift_cl:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
-; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
-; GENERIC-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:1.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: rshift_cl:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
-; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    negb %cl # sched: [1:0.50]
-; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shlq %cl, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: rshift_cl:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rsi, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    negb %cl # sched: [1:0.50]
-; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BTVER2-NEXT:    shlq %cl, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %shr = lshr i64 %a, %c
-  %sub = sub nsw i64 64, %c
-  %shl = shl i64 %b, %sub
-  %or = or i64 %shr, %shl
-  ret i64 %or
-}
-
-; extern uint64_t x;
-;void lshift(uint64_t a, uint64_t b, uint_64_t c)
-;{
-;    x = (x << c) | (a >> (64-c));
-;}
-@x = global i64 0, align 4
-
-define void @lshift_mem_cl_optsize(i64 %a, i64 %c) nounwind readnone optsize {
-; GENERIC-LABEL: lshift_mem_cl_optsize:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq %rsi, %rcx # sched: [1:0.33]
-; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
-; GENERIC-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [10:1.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: lshift_mem_cl_optsize:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
-; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [4:11.00]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: lshift_mem_cl_optsize:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
-; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BTVER2-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [9:11.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %b = load i64, i64* @x
-  %shl = shl i64 %b, %c
-  %sub = sub nsw i64 64, %c
-  %shr = lshr i64 %a, %sub
-  %or = or i64 %shl, %shr
-  store i64 %or, i64* @x
-  ret void
-}
-
-define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
-; GENERIC-LABEL: lshift_mem_cl:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq %rsi, %rcx # sched: [1:0.33]
-; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
-; GENERIC-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [10:1.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: lshift_mem_cl:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
-; BDVER12-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
-; BDVER12-NEXT:    shlq %cl, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    negb %cl # sched: [1:0.50]
-; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: lshift_mem_cl:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [3:1.00]
-; BTVER2-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
-; BTVER2-NEXT:    shlq %cl, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    negb %cl # sched: [1:0.50]
-; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BTVER2-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rax, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %b = load i64, i64* @x
-  %shl = shl i64 %b, %c
-  %sub = sub nsw i64 64, %c
-  %shr = lshr i64 %a, %sub
-  %or = or i64 %shl, %shr
-  store i64 %or, i64* @x
-  ret void
-}
-
-define void @lshift_mem(i64 %a) nounwind readnone {
-; GENERIC-LABEL: lshift_mem:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: lshift_mem:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
-; BDVER12-NEXT:    shrq $54, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    shlq $10, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: lshift_mem:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [3:1.00]
-; BTVER2-NEXT:    shrq $54, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    shlq $10, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rax, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %b = load i64, i64* @x
-  %shl = shl i64 %b, 10
-  %shr = lshr i64 %a, 54
-  %or = or i64 %shr, %shl
-  store i64 %or, i64* @x
-  ret void
-}
-
-define void @lshift_mem_optsize(i64 %a) nounwind readnone optsize {
-; GENERIC-LABEL: lshift_mem_optsize:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: lshift_mem_optsize:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [4:11.00]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: lshift_mem_optsize:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [9:11.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %b = load i64, i64* @x
-  %shl = shl i64 %b, 10
-  %shr = lshr i64 %a, 54
-  %or = or i64 %shr, %shl
-  store i64 %or, i64* @x
-  ret void
-}
-
-define void @lshift_mem_b(i64 %b) nounwind readnone {
-; GENERIC-LABEL: lshift_mem_b:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
-; GENERIC-NEXT:    shrdq $54, %rdi, %rax # sched: [2:0.67]
-; GENERIC-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: lshift_mem_b:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
-; BDVER12-NEXT:    shlq $10, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    shrq $54, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: lshift_mem_b:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [3:1.00]
-; BTVER2-NEXT:    shlq $10, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    shrq $54, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %a = load i64, i64* @x
-  %shl = shl i64 %b, 10
-  %shr = lshr i64 %a, 54
-  %or = or i64 %shr, %shl
-  store i64 %or, i64* @x
-  ret void
-}
-
-define void @lshift_mem_b_optsize(i64 %b) nounwind readnone optsize {
-; GENERIC-LABEL: lshift_mem_b_optsize:
-; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
-; GENERIC-NEXT:    shrdq $54, %rdi, %rax # sched: [2:0.67]
-; GENERIC-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; BDVER12-LABEL: lshift_mem_b_optsize:
-; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
-; BDVER12-NEXT:    shrdq $54, %rdi, %rax # sched: [4:3.00]
-; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
-; BDVER12-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: lshift_mem_b_optsize:
-; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [3:1.00]
-; BTVER2-NEXT:    shrdq $54, %rdi, %rax # sched: [3:3.00]
-; BTVER2-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-entry:
-  %a = load i64, i64* @x
-  %shl = shl i64 %b, 10
-  %shr = lshr i64 %a, 54
-  %or = or i64 %shr, %shl
-  store i64 %or, i64* @x
-  ret void
-}
-

Removed: llvm/trunk/test/CodeGen/X86/schedule-x86_32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/schedule-x86_32.ll?rev=353042&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/schedule-x86_32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/schedule-x86_32.ll (removed)
@@ -1,2601 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=i686 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
-
-define i8 @test_aaa(i8 %a0) optsize {
-; GENERIC-LABEL: test_aaa:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movb {{[0-9]+}}(%esp), %al
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    aaa
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_aaa:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    aaa # sched: [13:6.50]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_aaa:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    aaa # sched: [100:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_aaa:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    aaa # sched: [100:0.33]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_aaa:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    aaa # sched: [100:0.25]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_aaa:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    aaa # sched: [100:0.25]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_aaa:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    aaa # sched: [100:0.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_aaa:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    aaa # sched: [100:0.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_aaa:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    aaa # sched: [100:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_aaa:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    aaa # sched: [100:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_aaa:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    aaa # sched: [100:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  %1 = tail call i8 asm "aaa", "=r,r"(i8 %a0) nounwind
-  ret i8 %1
-}
-
-define void @test_aad(i16 %a0) optsize {
-; GENERIC-LABEL: test_aad:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    aad
-; GENERIC-NEXT:    aad $16
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_aad:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    aad # sched: [7:3.50]
-; ATOM-NEXT:    aad $16 # sched: [7:3.50]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_aad:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    aad # sched: [100:1.00]
-; SLM-NEXT:    aad $16 # sched: [100:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_aad:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    aad # sched: [100:0.33]
-; SANDY-NEXT:    aad $16 # sched: [100:0.33]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_aad:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    aad # sched: [100:0.25]
-; HASWELL-NEXT:    aad $16 # sched: [100:0.25]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_aad:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    aad # sched: [100:0.25]
-; BROADWELL-NEXT:    aad $16 # sched: [100:0.25]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_aad:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    aad # sched: [100:0.25]
-; SKYLAKE-NEXT:    aad $16 # sched: [100:0.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_aad:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    aad # sched: [100:0.25]
-; SKX-NEXT:    aad $16 # sched: [100:0.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_aad:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    aad # sched: [100:0.50]
-; BDVER2-NEXT:    aad $16 # sched: [100:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_aad:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    aad # sched: [100:0.50]
-; BTVER2-NEXT:    aad $16 # sched: [100:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_aad:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    aad # sched: [100:0.25]
-; ZNVER1-NEXT:    aad $16 # sched: [100:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  tail call void asm "aad \0A\09 aad $1", "r,i"(i16 %a0, i16 16) nounwind
-  ret void
-}
-
-define void @test_aam(i8 %a0) optsize {
-; GENERIC-LABEL: test_aam:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movb {{[0-9]+}}(%esp), %al
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    aam
-; GENERIC-NEXT:    aam $16
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_aam:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    aam # sched: [21:10.50]
-; ATOM-NEXT:    aam $16 # sched: [21:10.50]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_aam:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    aam # sched: [100:1.00]
-; SLM-NEXT:    aam $16 # sched: [100:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_aam:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    aam # sched: [100:0.33]
-; SANDY-NEXT:    aam $16 # sched: [100:0.33]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_aam:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    aam # sched: [100:0.25]
-; HASWELL-NEXT:    aam $16 # sched: [100:0.25]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_aam:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    aam # sched: [100:0.25]
-; BROADWELL-NEXT:    aam $16 # sched: [100:0.25]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_aam:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    aam # sched: [100:0.25]
-; SKYLAKE-NEXT:    aam $16 # sched: [100:0.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_aam:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    aam # sched: [100:0.25]
-; SKX-NEXT:    aam $16 # sched: [100:0.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_aam:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    aam # sched: [100:0.50]
-; BDVER2-NEXT:    aam $16 # sched: [100:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_aam:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    aam # sched: [100:0.50]
-; BTVER2-NEXT:    aam $16 # sched: [100:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_aam:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    aam # sched: [100:0.25]
-; ZNVER1-NEXT:    aam $16 # sched: [100:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  tail call void asm "aam \0A\09 aam $1", "r,i"(i8 %a0, i8 16) nounwind
-  ret void
-}
-
-define i8 @test_aas(i8 %a0) optsize {
-; GENERIC-LABEL: test_aas:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movb {{[0-9]+}}(%esp), %al
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    aas
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_aas:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    aas # sched: [13:6.50]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_aas:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    aas # sched: [100:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_aas:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    aas # sched: [100:0.33]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_aas:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    aas # sched: [100:0.25]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_aas:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    aas # sched: [100:0.25]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_aas:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    aas # sched: [100:0.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_aas:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    aas # sched: [100:0.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_aas:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    aas # sched: [100:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_aas:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    aas # sched: [100:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_aas:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    aas # sched: [100:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  %1 = tail call i8 asm "aas", "=r,r"(i8 %a0) nounwind
-  ret i8 %1
-}
-
-define void @test_arpl(i16 %a0, i16 *%a1) optsize {
-; GENERIC-LABEL: test_arpl:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    arpl %ax, (%ecx)
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_arpl:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    arpl %ax, (%ecx) # sched: [23:11.50]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_arpl:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    arpl %ax, (%ecx) # sched: [100:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_arpl:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    arpl %ax, (%ecx) # sched: [100:0.33]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_arpl:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    arpl %ax, (%ecx) # sched: [100:0.25]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_arpl:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    arpl %ax, (%ecx) # sched: [100:0.25]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_arpl:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    arpl %ax, (%ecx) # sched: [100:0.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_arpl:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    arpl %ax, (%ecx) # sched: [100:0.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_arpl:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    arpl %ax, (%ecx) # sched: [100:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_arpl:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    arpl %ax, (%ecx) # sched: [100:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_arpl:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    arpl %ax, (%ecx) # sched: [100:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  call void asm sideeffect "arpl $0, $1", "r,*m"(i16 %a0, i16 *%a1)
-  ret void
-}
-
-define void @test_bound(i16 %a0, i16 *%a1, i32 %a2, i32 *%a3) optsize {
-; GENERIC-LABEL: test_bound:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    pushl %esi
-; GENERIC-NEXT:    .cfi_def_cfa_offset 8
-; GENERIC-NEXT:    .cfi_offset %esi, -8
-; GENERIC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    bound %ax, (%esi)
-; GENERIC-NEXT:    bound %ecx, (%edx)
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    popl %esi
-; GENERIC-NEXT:    .cfi_def_cfa_offset 4
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_bound:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    pushl %esi # sched: [1:1.00]
-; ATOM-NEXT:    .cfi_def_cfa_offset 8
-; ATOM-NEXT:    .cfi_offset %esi, -8
-; ATOM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    bound %ax, (%esi) # sched: [11:5.50]
-; ATOM-NEXT:    bound %ecx, (%edx) # sched: [11:5.50]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    popl %esi # sched: [1:1.00]
-; ATOM-NEXT:    .cfi_def_cfa_offset 4
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_bound:
-; SLM:       # %bb.0:
-; SLM-NEXT:    pushl %esi # sched: [1:1.00]
-; SLM-NEXT:    .cfi_def_cfa_offset 8
-; SLM-NEXT:    .cfi_offset %esi, -8
-; SLM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [3:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    bound %ax, (%esi) # sched: [100:1.00]
-; SLM-NEXT:    bound %ecx, (%edx) # sched: [100:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    popl %esi # sched: [3:1.00]
-; SLM-NEXT:    .cfi_def_cfa_offset 4
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_bound:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    pushl %esi # sched: [5:1.00]
-; SANDY-NEXT:    .cfi_def_cfa_offset 8
-; SANDY-NEXT:    .cfi_offset %esi, -8
-; SANDY-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    bound %ax, (%esi) # sched: [100:0.33]
-; SANDY-NEXT:    bound %ecx, (%edx) # sched: [100:0.33]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    popl %esi # sched: [6:0.50]
-; SANDY-NEXT:    .cfi_def_cfa_offset 4
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_bound:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    pushl %esi # sched: [2:1.00]
-; HASWELL-NEXT:    .cfi_def_cfa_offset 8
-; HASWELL-NEXT:    .cfi_offset %esi, -8
-; HASWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    bound %ax, (%esi) # sched: [1:3.75]
-; HASWELL-NEXT:    bound %ecx, (%edx) # sched: [1:3.75]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    popl %esi # sched: [6:0.50]
-; HASWELL-NEXT:    .cfi_def_cfa_offset 4
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_bound:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    pushl %esi # sched: [2:1.00]
-; BROADWELL-NEXT:    .cfi_def_cfa_offset 8
-; BROADWELL-NEXT:    .cfi_offset %esi, -8
-; BROADWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    bound %ax, (%esi) # sched: [100:0.25]
-; BROADWELL-NEXT:    bound %ecx, (%edx) # sched: [100:0.25]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    popl %esi # sched: [6:0.50]
-; BROADWELL-NEXT:    .cfi_def_cfa_offset 4
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_bound:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    pushl %esi # sched: [2:1.00]
-; SKYLAKE-NEXT:    .cfi_def_cfa_offset 8
-; SKYLAKE-NEXT:    .cfi_offset %esi, -8
-; SKYLAKE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    bound %ax, (%esi) # sched: [100:0.25]
-; SKYLAKE-NEXT:    bound %ecx, (%edx) # sched: [100:0.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    popl %esi # sched: [6:0.50]
-; SKYLAKE-NEXT:    .cfi_def_cfa_offset 4
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_bound:
-; SKX:       # %bb.0:
-; SKX-NEXT:    pushl %esi # sched: [2:1.00]
-; SKX-NEXT:    .cfi_def_cfa_offset 8
-; SKX-NEXT:    .cfi_offset %esi, -8
-; SKX-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    bound %ax, (%esi) # sched: [100:0.25]
-; SKX-NEXT:    bound %ecx, (%edx) # sched: [100:0.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    popl %esi # sched: [6:0.50]
-; SKX-NEXT:    .cfi_def_cfa_offset 4
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_bound:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pushl %esi # sched: [1:1.00]
-; BDVER2-NEXT:    .cfi_def_cfa_offset 8
-; BDVER2-NEXT:    .cfi_offset %esi, -8
-; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    bound %ax, (%esi) # sched: [100:0.50]
-; BDVER2-NEXT:    bound %ecx, (%edx) # sched: [100:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    popl %esi # sched: [5:0.50]
-; BDVER2-NEXT:    .cfi_def_cfa_offset 4
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_bound:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    pushl %esi # sched: [1:1.00]
-; BTVER2-NEXT:    .cfi_def_cfa_offset 8
-; BTVER2-NEXT:    .cfi_offset %esi, -8
-; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [3:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    bound %ax, (%esi) # sched: [100:0.50]
-; BTVER2-NEXT:    bound %ecx, (%edx) # sched: [100:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    popl %esi # sched: [3:1.00]
-; BTVER2-NEXT:    .cfi_def_cfa_offset 4
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_bound:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    pushl %esi # sched: [1:0.50]
-; ZNVER1-NEXT:    .cfi_def_cfa_offset 8
-; ZNVER1-NEXT:    .cfi_offset %esi, -8
-; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    bound %ax, (%esi) # sched: [100:0.25]
-; ZNVER1-NEXT:    bound %ecx, (%edx) # sched: [100:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    popl %esi # sched: [8:0.50]
-; ZNVER1-NEXT:    .cfi_def_cfa_offset 4
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  call void asm sideeffect "bound $0, $1 \0A\09 bound $2, $3", "r,*m,r,*m"(i16 %a0, i16 *%a1, i32 %a2, i32 *%a3)
-  ret void
-}
-
-; TODO - test_call
-
-define i8 @test_daa(i8 %a0) optsize {
-; GENERIC-LABEL: test_daa:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movb {{[0-9]+}}(%esp), %al
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    daa
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_daa:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    daa # sched: [18:9.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_daa:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    daa # sched: [100:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_daa:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    daa # sched: [100:0.33]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_daa:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    daa # sched: [100:0.25]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_daa:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    daa # sched: [100:0.25]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_daa:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    daa # sched: [100:0.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_daa:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    daa # sched: [100:0.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_daa:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    daa # sched: [100:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_daa:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    daa # sched: [100:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_daa:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    daa # sched: [100:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  %1 = tail call i8 asm "daa", "=r,r"(i8 %a0) nounwind
-  ret i8 %1
-}
-
-define i8 @test_das(i8 %a0) optsize {
-; GENERIC-LABEL: test_das:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movb {{[0-9]+}}(%esp), %al
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    das
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_das:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    das # sched: [20:10.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_das:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    das # sched: [100:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_das:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    das # sched: [100:0.33]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_das:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    das # sched: [100:0.25]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_das:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    das # sched: [100:0.25]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_das:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    das # sched: [100:0.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_das:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    das # sched: [100:0.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_das:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    das # sched: [100:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_das:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    das # sched: [100:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_das:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    das # sched: [100:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  %1 = tail call i8 asm "das", "=r,r"(i8 %a0) nounwind
-  ret i8 %1
-}
-
-define void @test_dec16(i16 %a0, i16* %a1) optsize {
-; GENERIC-LABEL: test_dec16:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    decw %ax
-; GENERIC-NEXT:    decw (%ecx)
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_dec16:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    decw %ax # sched: [1:0.50]
-; ATOM-NEXT:    decw (%ecx) # sched: [1:1.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_dec16:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    decw %ax # sched: [1:0.50]
-; SLM-NEXT:    decw (%ecx) # sched: [5:2.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_dec16:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    decw %ax # sched: [1:0.33]
-; SANDY-NEXT:    decw (%ecx) # sched: [7:1.00]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_dec16:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    decw %ax # sched: [1:0.25]
-; HASWELL-NEXT:    decw (%ecx) # sched: [7:1.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_dec16:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    decw %ax # sched: [1:0.25]
-; BROADWELL-NEXT:    decw (%ecx) # sched: [7:1.00]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_dec16:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    decw %ax # sched: [1:0.25]
-; SKYLAKE-NEXT:    decw (%ecx) # sched: [7:1.00]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_dec16:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    decw %ax # sched: [1:0.25]
-; SKX-NEXT:    decw (%ecx) # sched: [7:1.00]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_dec16:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    decw %ax # sched: [1:0.50]
-; BDVER2-NEXT:    decw (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_dec16:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    decw %ax # sched: [1:0.50]
-; BTVER2-NEXT:    decw (%ecx) # sched: [5:1.00]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_dec16:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    decw %ax # sched: [1:0.25]
-; ZNVER1-NEXT:    decw (%ecx) # sched: [5:0.50]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  tail call void asm "decw $0 \0A\09 decw $1", "r,*m"(i16 %a0, i16* %a1) nounwind
-  ret void
-}
-define void @test_dec32(i32 %a0, i32* %a1) optsize {
-; GENERIC-LABEL: test_dec32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    decl %eax
-; GENERIC-NEXT:    decl (%ecx)
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_dec32:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    decl %eax # sched: [1:0.50]
-; ATOM-NEXT:    decl (%ecx) # sched: [1:1.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_dec32:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    decl %eax # sched: [1:0.50]
-; SLM-NEXT:    decl (%ecx) # sched: [5:2.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_dec32:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    decl %eax # sched: [1:0.33]
-; SANDY-NEXT:    decl (%ecx) # sched: [7:1.00]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_dec32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    decl %eax # sched: [1:0.25]
-; HASWELL-NEXT:    decl (%ecx) # sched: [7:1.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_dec32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    decl %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    decl (%ecx) # sched: [7:1.00]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_dec32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    decl %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    decl (%ecx) # sched: [7:1.00]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_dec32:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    decl %eax # sched: [1:0.25]
-; SKX-NEXT:    decl (%ecx) # sched: [7:1.00]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_dec32:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    decl %eax # sched: [1:0.50]
-; BDVER2-NEXT:    decl (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_dec32:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    decl %eax # sched: [1:0.50]
-; BTVER2-NEXT:    decl (%ecx) # sched: [5:1.00]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_dec32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    decl %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    decl (%ecx) # sched: [5:0.50]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  tail call void asm "decl $0 \0A\09 decl $1", "r,*m"(i32 %a0, i32* %a1) nounwind
-  ret void
-}
-
-define void @test_inc16(i16 %a0, i16* %a1) optsize {
-; GENERIC-LABEL: test_inc16:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    incw %ax
-; GENERIC-NEXT:    incw (%ecx)
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_inc16:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    incw %ax # sched: [1:0.50]
-; ATOM-NEXT:    incw (%ecx) # sched: [1:1.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_inc16:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    incw %ax # sched: [1:0.50]
-; SLM-NEXT:    incw (%ecx) # sched: [5:2.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_inc16:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    incw %ax # sched: [1:0.33]
-; SANDY-NEXT:    incw (%ecx) # sched: [7:1.00]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_inc16:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    incw %ax # sched: [1:0.25]
-; HASWELL-NEXT:    incw (%ecx) # sched: [7:1.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_inc16:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    incw %ax # sched: [1:0.25]
-; BROADWELL-NEXT:    incw (%ecx) # sched: [7:1.00]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_inc16:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    incw %ax # sched: [1:0.25]
-; SKYLAKE-NEXT:    incw (%ecx) # sched: [7:1.00]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_inc16:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    incw %ax # sched: [1:0.25]
-; SKX-NEXT:    incw (%ecx) # sched: [7:1.00]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_inc16:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    incw %ax # sched: [1:0.50]
-; BDVER2-NEXT:    incw (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_inc16:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    incw %ax # sched: [1:0.50]
-; BTVER2-NEXT:    incw (%ecx) # sched: [5:1.00]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_inc16:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    incw %ax # sched: [1:0.25]
-; ZNVER1-NEXT:    incw (%ecx) # sched: [5:0.50]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  tail call void asm "incw $0 \0A\09 incw $1", "r,*m"(i16 %a0, i16* %a1) nounwind
-  ret void
-}
-define void @test_inc32(i32 %a0, i32* %a1) optsize {
-; GENERIC-LABEL: test_inc32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    incl %eax
-; GENERIC-NEXT:    incl (%ecx)
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_inc32:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    incl %eax # sched: [1:0.50]
-; ATOM-NEXT:    incl (%ecx) # sched: [1:1.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_inc32:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    incl %eax # sched: [1:0.50]
-; SLM-NEXT:    incl (%ecx) # sched: [5:2.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_inc32:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    incl %eax # sched: [1:0.33]
-; SANDY-NEXT:    incl (%ecx) # sched: [7:1.00]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_inc32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    incl %eax # sched: [1:0.25]
-; HASWELL-NEXT:    incl (%ecx) # sched: [7:1.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_inc32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    incl %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    incl (%ecx) # sched: [7:1.00]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_inc32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    incl %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    incl (%ecx) # sched: [7:1.00]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_inc32:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    incl %eax # sched: [1:0.25]
-; SKX-NEXT:    incl (%ecx) # sched: [7:1.00]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_inc32:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    incl %eax # sched: [1:0.50]
-; BDVER2-NEXT:    incl (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_inc32:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    incl %eax # sched: [1:0.50]
-; BTVER2-NEXT:    incl (%ecx) # sched: [5:1.00]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_inc32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    incl %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    incl (%ecx) # sched: [5:0.50]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  tail call void asm "incl $0 \0A\09 incl $1", "r,*m"(i32 %a0, i32* %a1) nounwind
-  ret void
-}
-
-define void @test_into() optsize {
-; GENERIC-LABEL: test_into:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    into
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_into:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    into # sched: [6:3.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_into:
-; SLM:       # %bb.0:
-; SLM-NEXT:    #APP
-; SLM-NEXT:    into # sched: [100:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_into:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    into # sched: [100:0.33]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_into:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    into # sched: [1:1.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_into:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    into # sched: [100:0.25]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_into:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    into # sched: [100:0.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_into:
-; SKX:       # %bb.0:
-; SKX-NEXT:    #APP
-; SKX-NEXT:    into # sched: [100:0.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_into:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    into # sched: [100:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_into:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    into # sched: [100:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_into:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    into # sched: [100:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  call void asm sideeffect "into", ""()
-  ret void
-}
-
-; TODO - test_jmp
-
-define void @test_jcxz_jecxz() optsize {
-; GENERIC-LABEL: test_jcxz_jecxz:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:  JXTGT:
-; GENERIC-NEXT:    jcxz JXTGT
-; GENERIC-NEXT:    jecxz JXTGT
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_jcxz_jecxz:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:  JXTGT:
-; ATOM-NEXT:    jcxz JXTGT # sched: [4:2.00]
-; ATOM-NEXT:    jecxz JXTGT # sched: [4:2.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_jcxz_jecxz:
-; SLM:       # %bb.0:
-; SLM-NEXT:    #APP
-; SLM-NEXT:  JXTGT:
-; SLM-NEXT:    jcxz JXTGT # sched: [1:1.00]
-; SLM-NEXT:    jecxz JXTGT # sched: [1:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_jcxz_jecxz:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:  JXTGT:
-; SANDY-NEXT:    jcxz JXTGT # sched: [2:1.00]
-; SANDY-NEXT:    jecxz JXTGT # sched: [2:1.00]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_jcxz_jecxz:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:  JXTGT:
-; HASWELL-NEXT:    jcxz JXTGT # sched: [2:0.50]
-; HASWELL-NEXT:    jecxz JXTGT # sched: [2:0.50]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_jcxz_jecxz:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:  JXTGT:
-; BROADWELL-NEXT:    jcxz JXTGT # sched: [2:0.50]
-; BROADWELL-NEXT:    jecxz JXTGT # sched: [2:0.50]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_jcxz_jecxz:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:  JXTGT:
-; SKYLAKE-NEXT:    jcxz JXTGT # sched: [2:0.50]
-; SKYLAKE-NEXT:    jecxz JXTGT # sched: [2:0.50]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_jcxz_jecxz:
-; SKX:       # %bb.0:
-; SKX-NEXT:    #APP
-; SKX-NEXT:  JXTGT:
-; SKX-NEXT:    jcxz JXTGT # sched: [2:0.50]
-; SKX-NEXT:    jecxz JXTGT # sched: [2:0.50]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_jcxz_jecxz:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:  JXTGT:
-; BDVER2-NEXT:    jcxz JXTGT # sched: [1:1.00]
-; BDVER2-NEXT:    jecxz JXTGT # sched: [1:1.00]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_jcxz_jecxz:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:  JXTGT:
-; BTVER2-NEXT:    jcxz JXTGT # sched: [1:0.50]
-; BTVER2-NEXT:    jecxz JXTGT # sched: [1:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_jcxz_jecxz:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:  JXTGT:
-; ZNVER1-NEXT:    jcxz JXTGT # sched: [1:0.50]
-; ZNVER1-NEXT:    jecxz JXTGT # sched: [1:0.50]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  call void asm sideeffect "JXTGT: \0A\09 jcxz JXTGT \0A\09 jecxz JXTGT", ""()
-  ret void
-}
-
-; TODO - test_lds
-
-define void @test_leave() optsize {
-; GENERIC-LABEL: test_leave:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    leave
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_leave:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    leave # sched: [2:1.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_leave:
-; SLM:       # %bb.0:
-; SLM-NEXT:    #APP
-; SLM-NEXT:    leave # sched: [1:0.50]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_leave:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    leave # sched: [7:0.67]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_leave:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    leave # sched: [7:0.50]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_leave:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    leave # sched: [7:0.50]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_leave:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    leave # sched: [7:0.50]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_leave:
-; SKX:       # %bb.0:
-; SKX-NEXT:    #APP
-; SKX-NEXT:    leave # sched: [7:0.50]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_leave:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    leave # sched: [1:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_leave:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    leave # sched: [1:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_leave:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    leave # sched: [8:0.50]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  tail call void asm "leave", ""() nounwind
-  ret void
-}
-
-; TODO - test_les
-
-define void @test_pop_push() optsize {
-; GENERIC-LABEL: test_pop_push:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    popl %ds
-; GENERIC-NEXT:    popl %es
-; GENERIC-NEXT:    popl %ss
-; GENERIC-NEXT:    popl %fs
-; GENERIC-NEXT:    popl %gs
-; GENERIC-NEXT:    pushl %cs
-; GENERIC-NEXT:    pushl %ds
-; GENERIC-NEXT:    pushl %es
-; GENERIC-NEXT:    pushl %ss
-; GENERIC-NEXT:    pushl %fs
-; GENERIC-NEXT:    pushl %gs
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_pop_push:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    popl %ds # sched: [29:14.50]
-; ATOM-NEXT:    popl %es # sched: [29:14.50]
-; ATOM-NEXT:    popl %ss # sched: [48:24.00]
-; ATOM-NEXT:    popl %fs # sched: [29:14.50]
-; ATOM-NEXT:    popl %gs # sched: [29:14.50]
-; ATOM-NEXT:    pushl %cs # sched: [2:1.00]
-; ATOM-NEXT:    pushl %ds # sched: [2:1.00]
-; ATOM-NEXT:    pushl %es # sched: [2:1.00]
-; ATOM-NEXT:    pushl %ss # sched: [2:1.00]
-; ATOM-NEXT:    pushl %fs # sched: [2:1.00]
-; ATOM-NEXT:    pushl %gs # sched: [2:1.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_pop_push:
-; SLM:       # %bb.0:
-; SLM-NEXT:    #APP
-; SLM-NEXT:    popl %ds # sched: [100:1.00]
-; SLM-NEXT:    popl %es # sched: [100:1.00]
-; SLM-NEXT:    popl %ss # sched: [100:1.00]
-; SLM-NEXT:    popl %fs # sched: [100:1.00]
-; SLM-NEXT:    popl %gs # sched: [100:1.00]
-; SLM-NEXT:    pushl %cs # sched: [100:1.00]
-; SLM-NEXT:    pushl %ds # sched: [100:1.00]
-; SLM-NEXT:    pushl %es # sched: [100:1.00]
-; SLM-NEXT:    pushl %ss # sched: [100:1.00]
-; SLM-NEXT:    pushl %fs # sched: [100:1.00]
-; SLM-NEXT:    pushl %gs # sched: [100:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_pop_push:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    popl %ds # sched: [100:0.33]
-; SANDY-NEXT:    popl %es # sched: [100:0.33]
-; SANDY-NEXT:    popl %ss # sched: [100:0.33]
-; SANDY-NEXT:    popl %fs # sched: [100:0.33]
-; SANDY-NEXT:    popl %gs # sched: [100:0.33]
-; SANDY-NEXT:    pushl %cs # sched: [100:0.33]
-; SANDY-NEXT:    pushl %ds # sched: [100:0.33]
-; SANDY-NEXT:    pushl %es # sched: [100:0.33]
-; SANDY-NEXT:    pushl %ss # sched: [100:0.33]
-; SANDY-NEXT:    pushl %fs # sched: [100:0.33]
-; SANDY-NEXT:    pushl %gs # sched: [100:0.33]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_pop_push:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    popl %ds # sched: [100:0.25]
-; HASWELL-NEXT:    popl %es # sched: [100:0.25]
-; HASWELL-NEXT:    popl %ss # sched: [100:0.25]
-; HASWELL-NEXT:    popl %fs # sched: [100:0.25]
-; HASWELL-NEXT:    popl %gs # sched: [100:0.25]
-; HASWELL-NEXT:    pushl %cs # sched: [100:0.25]
-; HASWELL-NEXT:    pushl %ds # sched: [100:0.25]
-; HASWELL-NEXT:    pushl %es # sched: [100:0.25]
-; HASWELL-NEXT:    pushl %ss # sched: [100:0.25]
-; HASWELL-NEXT:    pushl %fs # sched: [100:0.25]
-; HASWELL-NEXT:    pushl %gs # sched: [100:0.25]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_pop_push:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    popl %ds # sched: [100:0.25]
-; BROADWELL-NEXT:    popl %es # sched: [100:0.25]
-; BROADWELL-NEXT:    popl %ss # sched: [100:0.25]
-; BROADWELL-NEXT:    popl %fs # sched: [100:0.25]
-; BROADWELL-NEXT:    popl %gs # sched: [100:0.25]
-; BROADWELL-NEXT:    pushl %cs # sched: [100:0.25]
-; BROADWELL-NEXT:    pushl %ds # sched: [100:0.25]
-; BROADWELL-NEXT:    pushl %es # sched: [100:0.25]
-; BROADWELL-NEXT:    pushl %ss # sched: [100:0.25]
-; BROADWELL-NEXT:    pushl %fs # sched: [100:0.25]
-; BROADWELL-NEXT:    pushl %gs # sched: [100:0.25]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_pop_push:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    popl %ds # sched: [100:0.25]
-; SKYLAKE-NEXT:    popl %es # sched: [100:0.25]
-; SKYLAKE-NEXT:    popl %ss # sched: [100:0.25]
-; SKYLAKE-NEXT:    popl %fs # sched: [100:0.25]
-; SKYLAKE-NEXT:    popl %gs # sched: [100:0.25]
-; SKYLAKE-NEXT:    pushl %cs # sched: [100:0.25]
-; SKYLAKE-NEXT:    pushl %ds # sched: [100:0.25]
-; SKYLAKE-NEXT:    pushl %es # sched: [100:0.25]
-; SKYLAKE-NEXT:    pushl %ss # sched: [100:0.25]
-; SKYLAKE-NEXT:    pushl %fs # sched: [100:0.25]
-; SKYLAKE-NEXT:    pushl %gs # sched: [100:0.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_pop_push:
-; SKX:       # %bb.0:
-; SKX-NEXT:    #APP
-; SKX-NEXT:    popl %ds # sched: [100:0.25]
-; SKX-NEXT:    popl %es # sched: [100:0.25]
-; SKX-NEXT:    popl %ss # sched: [100:0.25]
-; SKX-NEXT:    popl %fs # sched: [100:0.25]
-; SKX-NEXT:    popl %gs # sched: [100:0.25]
-; SKX-NEXT:    pushl %cs # sched: [100:0.25]
-; SKX-NEXT:    pushl %ds # sched: [100:0.25]
-; SKX-NEXT:    pushl %es # sched: [100:0.25]
-; SKX-NEXT:    pushl %ss # sched: [100:0.25]
-; SKX-NEXT:    pushl %fs # sched: [100:0.25]
-; SKX-NEXT:    pushl %gs # sched: [100:0.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_pop_push:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    popl %ds # sched: [100:0.50]
-; BDVER2-NEXT:    popl %es # sched: [100:0.50]
-; BDVER2-NEXT:    popl %ss # sched: [100:0.50]
-; BDVER2-NEXT:    popl %fs # sched: [100:0.50]
-; BDVER2-NEXT:    popl %gs # sched: [100:0.50]
-; BDVER2-NEXT:    pushl %cs # sched: [100:0.50]
-; BDVER2-NEXT:    pushl %ds # sched: [100:0.50]
-; BDVER2-NEXT:    pushl %es # sched: [100:0.50]
-; BDVER2-NEXT:    pushl %ss # sched: [100:0.50]
-; BDVER2-NEXT:    pushl %fs # sched: [100:0.50]
-; BDVER2-NEXT:    pushl %gs # sched: [100:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_pop_push:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    popl %ds # sched: [100:0.50]
-; BTVER2-NEXT:    popl %es # sched: [100:0.50]
-; BTVER2-NEXT:    popl %ss # sched: [100:0.50]
-; BTVER2-NEXT:    popl %fs # sched: [100:0.50]
-; BTVER2-NEXT:    popl %gs # sched: [100:0.50]
-; BTVER2-NEXT:    pushl %cs # sched: [100:0.50]
-; BTVER2-NEXT:    pushl %ds # sched: [100:0.50]
-; BTVER2-NEXT:    pushl %es # sched: [100:0.50]
-; BTVER2-NEXT:    pushl %ss # sched: [100:0.50]
-; BTVER2-NEXT:    pushl %fs # sched: [100:0.50]
-; BTVER2-NEXT:    pushl %gs # sched: [100:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_pop_push:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    popl %ds # sched: [100:0.25]
-; ZNVER1-NEXT:    popl %es # sched: [100:0.25]
-; ZNVER1-NEXT:    popl %ss # sched: [100:0.25]
-; ZNVER1-NEXT:    popl %fs # sched: [100:0.25]
-; ZNVER1-NEXT:    popl %gs # sched: [100:0.25]
-; ZNVER1-NEXT:    pushl %cs # sched: [100:0.25]
-; ZNVER1-NEXT:    pushl %ds # sched: [100:0.25]
-; ZNVER1-NEXT:    pushl %es # sched: [100:0.25]
-; ZNVER1-NEXT:    pushl %ss # sched: [100:0.25]
-; ZNVER1-NEXT:    pushl %fs # sched: [100:0.25]
-; ZNVER1-NEXT:    pushl %gs # sched: [100:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  call void asm sideeffect "pop %DS \0A\09 pop %ES \0A\09 pop %SS \0A\09 pop %FS \0A\09 pop %GS \0A\09 push %CS \0A\09 push %DS \0A\09 push %ES \0A\09 push %SS \0A\09 push %FS \0A\09 push %GS", ""()
-  ret void
-}
-define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
-; GENERIC-LABEL: test_pop_push_16:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    popw %ax
-; GENERIC-NEXT:    popw (%ecx)
-; GENERIC-NEXT:    pushw %ax
-; GENERIC-NEXT:    pushw (%ecx)
-; GENERIC-NEXT:    pushw $4095 # imm = 0xFFF
-; GENERIC-NEXT:    pushw $7
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_pop_push_16:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    popw %ax # sched: [2:1.00]
-; ATOM-NEXT:    popw (%ecx) # sched: [3:1.50]
-; ATOM-NEXT:    pushw %ax # sched: [1:1.00]
-; ATOM-NEXT:    pushw (%ecx) # sched: [2:1.00]
-; ATOM-NEXT:    pushw $4095 # imm = 0xFFF
-; ATOM-NEXT:    # sched: [1:1.00]
-; ATOM-NEXT:    pushw $7 # sched: [1:1.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_pop_push_16:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    popw %ax # sched: [3:1.00]
-; SLM-NEXT:    popw (%ecx) # sched: [4:2.00]
-; SLM-NEXT:    pushw %ax # sched: [1:1.00]
-; SLM-NEXT:    pushw (%ecx) # sched: [4:2.00]
-; SLM-NEXT:    pushw $4095 # imm = 0xFFF
-; SLM-NEXT:    # sched: [1:1.00]
-; SLM-NEXT:    pushw $7 # sched: [1:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_pop_push_16:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    popw %ax # sched: [6:0.50]
-; SANDY-NEXT:    popw (%ecx) # sched: [6:0.50]
-; SANDY-NEXT:    pushw %ax # sched: [5:1.00]
-; SANDY-NEXT:    pushw (%ecx) # sched: [5:1.00]
-; SANDY-NEXT:    pushw $4095 # imm = 0xFFF
-; SANDY-NEXT:    # sched: [1:1.00]
-; SANDY-NEXT:    pushw $7 # sched: [1:1.00]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_pop_push_16:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    popw %ax # sched: [6:0.50]
-; HASWELL-NEXT:    popw (%ecx) # sched: [7:1.00]
-; HASWELL-NEXT:    pushw %ax # sched: [2:1.00]
-; HASWELL-NEXT:    pushw (%ecx) # sched: [7:1.00]
-; HASWELL-NEXT:    pushw $4095 # imm = 0xFFF
-; HASWELL-NEXT:    # sched: [1:1.00]
-; HASWELL-NEXT:    pushw $7 # sched: [1:1.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_pop_push_16:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    popw %ax # sched: [6:0.50]
-; BROADWELL-NEXT:    popw (%ecx) # sched: [6:1.00]
-; BROADWELL-NEXT:    pushw %ax # sched: [2:1.00]
-; BROADWELL-NEXT:    pushw (%ecx) # sched: [6:1.00]
-; BROADWELL-NEXT:    pushw $4095 # imm = 0xFFF
-; BROADWELL-NEXT:    # sched: [1:1.00]
-; BROADWELL-NEXT:    pushw $7 # sched: [1:1.00]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_pop_push_16:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    popw %ax # sched: [6:0.50]
-; SKYLAKE-NEXT:    popw (%ecx) # sched: [6:1.00]
-; SKYLAKE-NEXT:    pushw %ax # sched: [2:1.00]
-; SKYLAKE-NEXT:    pushw (%ecx) # sched: [6:1.00]
-; SKYLAKE-NEXT:    pushw $4095 # imm = 0xFFF
-; SKYLAKE-NEXT:    # sched: [1:1.00]
-; SKYLAKE-NEXT:    pushw $7 # sched: [1:1.00]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_pop_push_16:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    popw %ax # sched: [6:0.50]
-; SKX-NEXT:    popw (%ecx) # sched: [6:1.00]
-; SKX-NEXT:    pushw %ax # sched: [2:1.00]
-; SKX-NEXT:    pushw (%ecx) # sched: [6:1.00]
-; SKX-NEXT:    pushw $4095 # imm = 0xFFF
-; SKX-NEXT:    # sched: [1:1.00]
-; SKX-NEXT:    pushw $7 # sched: [1:1.00]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_pop_push_16:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    popw %ax # sched: [5:0.50]
-; BDVER2-NEXT:    popw (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    pushw %ax # sched: [1:1.00]
-; BDVER2-NEXT:    pushw (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    pushw $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [1:1.00]
-; BDVER2-NEXT:    pushw $7 # sched: [1:1.00]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_pop_push_16:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    popw %ax # sched: [3:1.00]
-; BTVER2-NEXT:    popw (%ecx) # sched: [4:1.00]
-; BTVER2-NEXT:    pushw %ax # sched: [1:1.00]
-; BTVER2-NEXT:    pushw (%ecx) # sched: [4:1.00]
-; BTVER2-NEXT:    pushw $4095 # imm = 0xFFF
-; BTVER2-NEXT:    # sched: [1:1.00]
-; BTVER2-NEXT:    pushw $7 # sched: [1:1.00]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_pop_push_16:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    popw %ax # sched: [8:0.50]
-; ZNVER1-NEXT:    popw (%ecx) # sched: [5:0.50]
-; ZNVER1-NEXT:    pushw %ax # sched: [1:0.50]
-; ZNVER1-NEXT:    pushw (%ecx) # sched: [4:0.50]
-; ZNVER1-NEXT:    pushw $4095 # imm = 0xFFF
-; ZNVER1-NEXT:    # sched: [1:0.50]
-; ZNVER1-NEXT:    pushw $7 # sched: [1:0.50]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  %1 = call i16 asm sideeffect "popw $0 \0A\09 popw $2 \0A\09 pushw $1 \0A\09 pushw $2 \0A\09 pushw $3 \0A\09 pushw $4", "=r,r,*m,i,i"(i16 %a0, i16 *%a1, i16 4095, i8 7)
-  ret i16 %1
-}
-define i32 @test_pop_push_32(i32 %a0, i32 *%a1) optsize {
-; GENERIC-LABEL: test_pop_push_32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    popl %eax
-; GENERIC-NEXT:    popl (%ecx)
-; GENERIC-NEXT:    pushl %eax
-; GENERIC-NEXT:    pushl (%ecx)
-; GENERIC-NEXT:    pushl $4095 # imm = 0xFFF
-; GENERIC-NEXT:    pushl $7
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_pop_push_32:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    popl %eax # sched: [1:1.00]
-; ATOM-NEXT:    popl (%ecx) # sched: [3:1.50]
-; ATOM-NEXT:    pushl %eax # sched: [1:1.00]
-; ATOM-NEXT:    pushl (%ecx) # sched: [2:1.00]
-; ATOM-NEXT:    pushl $4095 # imm = 0xFFF
-; ATOM-NEXT:    # sched: [1:1.00]
-; ATOM-NEXT:    pushl $7 # sched: [1:1.00]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_pop_push_32:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    popl %eax # sched: [3:1.00]
-; SLM-NEXT:    popl (%ecx) # sched: [4:2.00]
-; SLM-NEXT:    pushl %eax # sched: [1:1.00]
-; SLM-NEXT:    pushl (%ecx) # sched: [4:2.00]
-; SLM-NEXT:    pushl $4095 # imm = 0xFFF
-; SLM-NEXT:    # sched: [1:1.00]
-; SLM-NEXT:    pushl $7 # sched: [1:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_pop_push_32:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    popl %eax # sched: [6:0.50]
-; SANDY-NEXT:    popl (%ecx) # sched: [6:0.50]
-; SANDY-NEXT:    pushl %eax # sched: [5:1.00]
-; SANDY-NEXT:    pushl (%ecx) # sched: [5:1.00]
-; SANDY-NEXT:    pushl $4095 # imm = 0xFFF
-; SANDY-NEXT:    # sched: [1:1.00]
-; SANDY-NEXT:    pushl $7 # sched: [1:1.00]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_pop_push_32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    popl %eax # sched: [6:0.50]
-; HASWELL-NEXT:    popl (%ecx) # sched: [7:1.00]
-; HASWELL-NEXT:    pushl %eax # sched: [2:1.00]
-; HASWELL-NEXT:    pushl (%ecx) # sched: [7:1.00]
-; HASWELL-NEXT:    pushl $4095 # imm = 0xFFF
-; HASWELL-NEXT:    # sched: [1:1.00]
-; HASWELL-NEXT:    pushl $7 # sched: [1:1.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_pop_push_32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    popl %eax # sched: [6:0.50]
-; BROADWELL-NEXT:    popl (%ecx) # sched: [6:1.00]
-; BROADWELL-NEXT:    pushl %eax # sched: [2:1.00]
-; BROADWELL-NEXT:    pushl (%ecx) # sched: [6:1.00]
-; BROADWELL-NEXT:    pushl $4095 # imm = 0xFFF
-; BROADWELL-NEXT:    # sched: [1:1.00]
-; BROADWELL-NEXT:    pushl $7 # sched: [1:1.00]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_pop_push_32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    popl %eax # sched: [6:0.50]
-; SKYLAKE-NEXT:    popl (%ecx) # sched: [6:1.00]
-; SKYLAKE-NEXT:    pushl %eax # sched: [2:1.00]
-; SKYLAKE-NEXT:    pushl (%ecx) # sched: [6:1.00]
-; SKYLAKE-NEXT:    pushl $4095 # imm = 0xFFF
-; SKYLAKE-NEXT:    # sched: [1:1.00]
-; SKYLAKE-NEXT:    pushl $7 # sched: [1:1.00]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_pop_push_32:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    popl %eax # sched: [6:0.50]
-; SKX-NEXT:    popl (%ecx) # sched: [6:1.00]
-; SKX-NEXT:    pushl %eax # sched: [2:1.00]
-; SKX-NEXT:    pushl (%ecx) # sched: [6:1.00]
-; SKX-NEXT:    pushl $4095 # imm = 0xFFF
-; SKX-NEXT:    # sched: [1:1.00]
-; SKX-NEXT:    pushl $7 # sched: [1:1.00]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_pop_push_32:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    popl %eax # sched: [5:0.50]
-; BDVER2-NEXT:    popl (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    pushl %eax # sched: [1:1.00]
-; BDVER2-NEXT:    pushl (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    pushl $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [1:1.00]
-; BDVER2-NEXT:    pushl $7 # sched: [1:1.00]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_pop_push_32:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    popl %eax # sched: [3:1.00]
-; BTVER2-NEXT:    popl (%ecx) # sched: [4:1.00]
-; BTVER2-NEXT:    pushl %eax # sched: [1:1.00]
-; BTVER2-NEXT:    pushl (%ecx) # sched: [4:1.00]
-; BTVER2-NEXT:    pushl $4095 # imm = 0xFFF
-; BTVER2-NEXT:    # sched: [1:1.00]
-; BTVER2-NEXT:    pushl $7 # sched: [1:1.00]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_pop_push_32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    popl %eax # sched: [8:0.50]
-; ZNVER1-NEXT:    popl (%ecx) # sched: [9:1.00]
-; ZNVER1-NEXT:    pushl %eax # sched: [1:0.50]
-; ZNVER1-NEXT:    pushl (%ecx) # sched: [4:0.50]
-; ZNVER1-NEXT:    pushl $4095 # imm = 0xFFF
-; ZNVER1-NEXT:    # sched: [1:0.50]
-; ZNVER1-NEXT:    pushl $7 # sched: [1:0.50]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  %1 = call i32 asm sideeffect "popl $0 \0A\09 popl $2 \0A\09 pushl $1 \0A\09 pushl $2 \0A\09 pushl $3 \0A\09 pushl $4", "=r,r,*m,i,i"(i32 %a0, i32 *%a1, i32 4095, i8 7)
-  ret i32 %1
-}
-
-define void @test_popa_popf_pusha_pushf() optsize {
-; GENERIC-LABEL: test_popa_popf_pusha_pushf:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    popal
-; GENERIC-NEXT:    popfl
-; GENERIC-NEXT:    pushal
-; GENERIC-NEXT:    pushfl
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_popa_popf_pusha_pushf:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    popal # sched: [9:4.50]
-; ATOM-NEXT:    popfl # sched: [26:13.00]
-; ATOM-NEXT:    pushal # sched: [8:4.00]
-; ATOM-NEXT:    pushfl # sched: [9:4.50]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_popa_popf_pusha_pushf:
-; SLM:       # %bb.0:
-; SLM-NEXT:    #APP
-; SLM-NEXT:    popal # sched: [3:1.00]
-; SLM-NEXT:    popfl # sched: [3:1.00]
-; SLM-NEXT:    pushal # sched: [1:1.00]
-; SLM-NEXT:    pushfl # sched: [1:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_popa_popf_pusha_pushf:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    popal # sched: [5:0.50]
-; SANDY-NEXT:    popfl # sched: [5:0.50]
-; SANDY-NEXT:    pushal # sched: [1:1.00]
-; SANDY-NEXT:    pushfl # sched: [1:1.00]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_popa_popf_pusha_pushf:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    popal # sched: [1:4.50]
-; HASWELL-NEXT:    popfl # sched: [5:0.50]
-; HASWELL-NEXT:    pushal # sched: [1:4.75]
-; HASWELL-NEXT:    pushfl # sched: [1:1.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_popa_popf_pusha_pushf:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    popal # sched: [5:0.50]
-; BROADWELL-NEXT:    popfl # sched: [5:0.50]
-; BROADWELL-NEXT:    pushal # sched: [1:1.00]
-; BROADWELL-NEXT:    pushfl # sched: [1:1.00]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_popa_popf_pusha_pushf:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    popal # sched: [5:0.50]
-; SKYLAKE-NEXT:    popfl # sched: [5:0.50]
-; SKYLAKE-NEXT:    pushal # sched: [1:1.00]
-; SKYLAKE-NEXT:    pushfl # sched: [1:1.00]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_popa_popf_pusha_pushf:
-; SKX:       # %bb.0:
-; SKX-NEXT:    #APP
-; SKX-NEXT:    popal # sched: [5:0.50]
-; SKX-NEXT:    popfl # sched: [5:0.50]
-; SKX-NEXT:    pushal # sched: [1:1.00]
-; SKX-NEXT:    pushfl # sched: [1:1.00]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_popa_popf_pusha_pushf:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    popal # sched: [5:0.50]
-; BDVER2-NEXT:    popfl # sched: [5:0.50]
-; BDVER2-NEXT:    pushal # sched: [1:1.00]
-; BDVER2-NEXT:    pushfl # sched: [1:1.00]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_popa_popf_pusha_pushf:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    popal # sched: [3:1.00]
-; BTVER2-NEXT:    popfl # sched: [3:1.00]
-; BTVER2-NEXT:    pushal # sched: [1:1.00]
-; BTVER2-NEXT:    pushfl # sched: [1:1.00]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_popa_popf_pusha_pushf:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    popal # sched: [100:0.25]
-; ZNVER1-NEXT:    popfl # sched: [100:0.25]
-; ZNVER1-NEXT:    pushal # sched: [8:0.50]
-; ZNVER1-NEXT:    pushfl # sched: [100:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  call void asm sideeffect "popa \0A\09 popf \0A\09 pusha \0A\09 pushf", ""()
-  ret void
-}
-
-define void @test_ret() optsize {
-; GENERIC-LABEL: test_ret:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    retl
-; GENERIC-NEXT:    retl $4095 # imm = 0xFFF
-; GENERIC-NEXT:    lretl
-; GENERIC-NEXT:    lretl $4095 # imm = 0xFFF
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_ret:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-; ATOM-NEXT:    retl $4095 # imm = 0xFFF
-; ATOM-NEXT:    # sched: [1:1.00]
-; ATOM-NEXT:    lretl # sched: [79:39.50]
-; ATOM-NEXT:    lretl $4095 # imm = 0xFFF
-; ATOM-NEXT:    # sched: [79:39.50]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_ret:
-; SLM:       # %bb.0:
-; SLM-NEXT:    #APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-; SLM-NEXT:    retl $4095 # imm = 0xFFF
-; SLM-NEXT:    # sched: [4:1.00]
-; SLM-NEXT:    lretl # sched: [4:1.00]
-; SLM-NEXT:    lretl $4095 # imm = 0xFFF
-; SLM-NEXT:    # sched: [4:1.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_ret:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-; SANDY-NEXT:    retl $4095 # imm = 0xFFF
-; SANDY-NEXT:    # sched: [6:1.00]
-; SANDY-NEXT:    lretl # sched: [6:1.00]
-; SANDY-NEXT:    lretl $4095 # imm = 0xFFF
-; SANDY-NEXT:    # sched: [6:1.00]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_ret:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-; HASWELL-NEXT:    retl $4095 # imm = 0xFFF
-; HASWELL-NEXT:    # sched: [1:2.00]
-; HASWELL-NEXT:    lretl # sched: [6:0.50]
-; HASWELL-NEXT:    lretl $4095 # imm = 0xFFF
-; HASWELL-NEXT:    # sched: [1:2.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_ret:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-; BROADWELL-NEXT:    retl $4095 # imm = 0xFFF
-; BROADWELL-NEXT:    # sched: [6:0.50]
-; BROADWELL-NEXT:    lretl # sched: [6:0.50]
-; BROADWELL-NEXT:    lretl $4095 # imm = 0xFFF
-; BROADWELL-NEXT:    # sched: [6:0.50]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_ret:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-; SKYLAKE-NEXT:    retl $4095 # imm = 0xFFF
-; SKYLAKE-NEXT:    # sched: [6:0.50]
-; SKYLAKE-NEXT:    lretl # sched: [6:0.50]
-; SKYLAKE-NEXT:    lretl $4095 # imm = 0xFFF
-; SKYLAKE-NEXT:    # sched: [6:0.50]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_ret:
-; SKX:       # %bb.0:
-; SKX-NEXT:    #APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-; SKX-NEXT:    retl $4095 # imm = 0xFFF
-; SKX-NEXT:    # sched: [6:0.50]
-; SKX-NEXT:    lretl # sched: [6:0.50]
-; SKX-NEXT:    lretl $4095 # imm = 0xFFF
-; SKX-NEXT:    # sched: [6:0.50]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_ret:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-; BDVER2-NEXT:    retl $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [5:1.00]
-; BDVER2-NEXT:    lretl # sched: [5:1.00]
-; BDVER2-NEXT:    lretl $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [5:1.00]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_ret:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-; BTVER2-NEXT:    retl $4095 # imm = 0xFFF
-; BTVER2-NEXT:    # sched: [4:1.00]
-; BTVER2-NEXT:    lretl # sched: [4:1.00]
-; BTVER2-NEXT:    lretl $4095 # imm = 0xFFF
-; BTVER2-NEXT:    # sched: [4:1.00]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_ret:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-; ZNVER1-NEXT:    retl $4095 # imm = 0xFFF
-; ZNVER1-NEXT:    # sched: [5:0.50]
-; ZNVER1-NEXT:    lretl # sched: [1:0.50]
-; ZNVER1-NEXT:    lretl $4095 # imm = 0xFFF
-; ZNVER1-NEXT:    # sched: [5:0.50]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  call void asm sideeffect "ret \0A\09 ret $0 \0A\09 lret \0A\09 lret $0", "i"(i16 4095)
-  ret void
-}
-
-define i8 @test_salc() optsize {
-; GENERIC-LABEL: test_salc:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    salc
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_salc:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    salc # sched: [1:0.50]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_salc:
-; SLM:       # %bb.0:
-; SLM-NEXT:    #APP
-; SLM-NEXT:    salc # sched: [1:0.50]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_salc:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    salc # sched: [1:0.33]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_salc:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    salc # sched: [1:0.25]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_salc:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    salc # sched: [1:0.25]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_salc:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    salc # sched: [1:0.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_salc:
-; SKX:       # %bb.0:
-; SKX-NEXT:    #APP
-; SKX-NEXT:    salc # sched: [1:0.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_salc:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    salc # sched: [1:0.50]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_salc:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    salc # sched: [1:0.50]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_salc:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    salc # sched: [1:0.25]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  %1 = tail call i8 asm "salc", "=r"() nounwind
-  ret i8 %1
-}
-
-; TODO - test_sgdt
-; TODO - test_sidt
-
-define void @test_xchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
-; GENERIC-LABEL: test_xchg_32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    xchgl %eax, %eax
-; GENERIC-NEXT:    xchgl %ecx, %eax
-; GENERIC-NEXT:    xchgl %eax, (%edx)
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retl
-;
-; ATOM-LABEL: test_xchg_32:
-; ATOM:       # %bb.0:
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
-; ATOM-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [1:1.00]
-; ATOM-NEXT:    #APP
-; ATOM-NEXT:    xchgl %eax, %eax # sched: [2:1.00]
-; ATOM-NEXT:    xchgl %ecx, %eax # sched: [2:1.00]
-; ATOM-NEXT:    xchgl %eax, (%edx) # sched: [3:1.50]
-; ATOM-NEXT:    #NO_APP
-; ATOM-NEXT:    retl # sched: [79:39.50]
-;
-; SLM-LABEL: test_xchg_32:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; SLM-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [3:1.00]
-; SLM-NEXT:    #APP
-; SLM-NEXT:    xchgl %eax, %eax # sched: [1:0.50]
-; SLM-NEXT:    xchgl %ecx, %eax # sched: [1:0.50]
-; SLM-NEXT:    xchgl %eax, (%edx) # sched: [4:2.00]
-; SLM-NEXT:    #NO_APP
-; SLM-NEXT:    retl # sched: [4:1.00]
-;
-; SANDY-LABEL: test_xchg_32:
-; SANDY:       # %bb.0:
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SANDY-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; SANDY-NEXT:    #APP
-; SANDY-NEXT:    xchgl %eax, %eax # sched: [2:1.00]
-; SANDY-NEXT:    xchgl %ecx, %eax # sched: [2:1.00]
-; SANDY-NEXT:    xchgl %eax, (%edx) # sched: [6:1.00]
-; SANDY-NEXT:    #NO_APP
-; SANDY-NEXT:    retl # sched: [6:1.00]
-;
-; HASWELL-LABEL: test_xchg_32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; HASWELL-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    xchgl %eax, %eax # sched: [2:0.75]
-; HASWELL-NEXT:    xchgl %ecx, %eax # sched: [2:0.75]
-; HASWELL-NEXT:    xchgl %eax, (%edx) # sched: [9:1.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retl # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_xchg_32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BROADWELL-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    xchgl %eax, %eax # sched: [2:0.75]
-; BROADWELL-NEXT:    xchgl %ecx, %eax # sched: [2:0.75]
-; BROADWELL-NEXT:    xchgl %eax, (%edx) # sched: [8:1.00]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retl # sched: [6:0.50]
-;
-; SKYLAKE-LABEL: test_xchg_32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    xchgl %eax, %eax # sched: [2:0.75]
-; SKYLAKE-NEXT:    xchgl %ecx, %eax # sched: [2:0.75]
-; SKYLAKE-NEXT:    xchgl %eax, (%edx) # sched: [10:1.25]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retl # sched: [6:0.50]
-;
-; SKX-LABEL: test_xchg_32:
-; SKX:       # %bb.0:
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; SKX-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; SKX-NEXT:    #APP
-; SKX-NEXT:    xchgl %eax, %eax # sched: [2:0.75]
-; SKX-NEXT:    xchgl %ecx, %eax # sched: [2:0.75]
-; SKX-NEXT:    xchgl %eax, (%edx) # sched: [10:1.25]
-; SKX-NEXT:    #NO_APP
-; SKX-NEXT:    retl # sched: [6:0.50]
-;
-; BDVER2-LABEL: test_xchg_32:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
-; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
-; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    xchgl %eax, %eax # sched: [1:1.00]
-; BDVER2-NEXT:    xchgl %ecx, %eax # sched: [1:1.00]
-; BDVER2-NEXT:    xchgl %eax, (%edx) # sched: [5:1.00]
-; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_xchg_32:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
-; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [3:1.00]
-; BTVER2-NEXT:    #APP
-; BTVER2-NEXT:    xchgl %eax, %eax # sched: [1:0.50]
-; BTVER2-NEXT:    xchgl %ecx, %eax # sched: [1:0.50]
-; BTVER2-NEXT:    xchgl %eax, (%edx) # sched: [4:1.00]
-; BTVER2-NEXT:    #NO_APP
-; BTVER2-NEXT:    retl # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_xchg_32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [8:0.50]
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    xchgl %eax, %eax # sched: [1:0.50]
-; ZNVER1-NEXT:    xchgl %ecx, %eax # sched: [1:0.50]
-; ZNVER1-NEXT:    xchgl %eax, (%edx) # sched: [5:0.50]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retl # sched: [1:0.50]
-  tail call void asm "xchg %EAX, $0 \0A\09 xchg $1, $0 \0A\09 xchg $2, $0", "r,r,*m"(i32 %a0, i32 %a1, i32 *%a2) nounwind
-  ret void
-}




More information about the llvm-commits mailing list