[llvm] b6e2796 - [X86][TwoAddressInstructionPass] Teach tryInstructionCommute to continue checking for commutable FMA operands in more cases.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 1 16:38:57 PST 2020


Author: Craig Topper
Date: 2020-03-01T16:38:08-08:00
New Revision: b6e2796114d08aadfabe8c889b5d96e6bc4f5e0e

URL: https://github.com/llvm/llvm-project/commit/b6e2796114d08aadfabe8c889b5d96e6bc4f5e0e
DIFF: https://github.com/llvm/llvm-project/commit/b6e2796114d08aadfabe8c889b5d96e6bc4f5e0e.diff

LOG: [X86][TwoAddressInstructionPass] Teach tryInstructionCommute to continue checking for commutable FMA operands in more cases.

Previously, we would only check for another commutable operand if the first commute was an aggressive commute.

But if we have two killed operands and neither is tied to the def at the start, we should consider either operand as a candidate to become the new def.

This improves the loop in the fma-commute-loop.ll test: the vfmadd213 instructions, along with the vmovapd copies they forced, become vfmadd231 forms that accumulate directly into the loop-carried registers. The test is derived from a Discourse post: https://llvm.discourse.group/t/unnecessary-vmovapd-instructions-generated-can-you-hint-in-favor-of-vfmadd231pd/582
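
As a rough standalone sketch of the control-flow change (the Operand struct and the shouldCommute heuristic below are simplified stand-ins, not the real LLVM API): after any successful commute we now adopt the other operand as the new base and keep scanning, instead of returning early when the commute was kill-based.

#include <cstdio>
#include <vector>

struct Operand {
  unsigned Reg;
  bool Killed;
};

// Simplified stand-in for the kill-based profitability heuristic; the
// real isProfitableToCommute considers much more.
static bool shouldCommute(const Operand &Base, const Operand &Other) {
  return Other.Killed && !Base.Killed;
}

int main() {
  // Hypothetical three-source FMA-like instruction; Ops[0] starts out as
  // the operand tied to the def.
  std::vector<Operand> Ops = {{1, false}, {2, true}, {3, true}};

  Operand Base = Ops[0];
  for (std::size_t OtherIdx = 1; OtherIdx < Ops.size(); ++OtherIdx) {
    Operand Other = Ops[OtherIdx];
    if (!shouldCommute(Base, Other))
      continue;
    std::printf("commute r%u <-> r%u\n", Base.Reg, Other.Reg);
    // Post-patch behavior: make the other operand the new base and keep
    // scanning. The old code returned here unless the commute was an
    // aggressive one. (The real pass also resamples the operand count,
    // since X86 can reduce it.)
    Base = Other;
  }
  return 0;
}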

Differential Revision: https://reviews.llvm.org/D75016

Added: 
    

Modified: 
    llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
    llvm/test/CodeGen/X86/fma-commute-loop.ll
    llvm/test/CodeGen/X86/recip-fastmath.ll
    llvm/test/CodeGen/X86/recip-fastmath2.ll
    llvm/test/CodeGen/X86/sqrt-fastmath.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 2b1ffab74b6f..336077f297d2 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1238,21 +1238,18 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
                                         Dist)) {
       MadeChange = true;
       ++NumCommuted;
-      if (AggressiveCommute) {
+      if (AggressiveCommute)
         ++NumAggrCommuted;
-        // There might be more than two commutable operands, update BaseOp and
-        // continue scanning.
-        // FIXME: This assumes that the new instruction's operands are in the
-        // same positions and were simply swapped.
-        BaseOpReg = OtherOpReg;
-        BaseOpKilled = OtherOpKilled;
-        // Resamples OpsNum in case the number of operands was reduced. This
-        // happens with X86.
-        OpsNum = MI->getDesc().getNumOperands();
-        continue;
-      }
-      // If this was a commute based on kill, we won't do better continuing.
-      return MadeChange;
+
+      // There might be more than two commutable operands, update BaseOp and
+      // continue scanning.
+      // FIXME: This assumes that the new instruction's operands are in the
+      // same positions and were simply swapped.
+      BaseOpReg = OtherOpReg;
+      BaseOpKilled = OtherOpKilled;
+      // Resamples OpsNum in case the number of operands was reduced. This
+      // happens with X86.
+      OpsNum = MI->getDesc().getNumOperands();
     }
   }
   return MadeChange;

diff --git a/llvm/test/CodeGen/X86/fma-commute-loop.ll b/llvm/test/CodeGen/X86/fma-commute-loop.ll
index f96e9c12dba3..6b0bceb88f47 100644
--- a/llvm/test/CodeGen/X86/fma-commute-loop.ll
+++ b/llvm/test/CodeGen/X86/fma-commute-loop.ll
@@ -25,25 +25,23 @@ define void @eggs(<8 x double>* %arg, <8 x double>* %arg1, <8 x double>* %arg2,
 ; CHECK-NEXT:    addq {{[0-9]+}}(%rsp), %r12
 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT:    vxorpd %xmm5, %xmm5, %xmm5
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_1: ## %bb15
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovapd %zmm5, %zmm6
-; CHECK-NEXT:    vmovapd %zmm4, %zmm7
-; CHECK-NEXT:    vmovupd (%rax,%r11,8), %zmm4
-; CHECK-NEXT:    vmovupd (%rax,%r13,8), %zmm5
+; CHECK-NEXT:    vmovupd (%rax,%r11,8), %zmm6
+; CHECK-NEXT:    vmovupd (%rax,%r13,8), %zmm7
 ; CHECK-NEXT:    vmovupd (%rax,%r12,8), %zmm8
 ; CHECK-NEXT:    vbroadcastsd (%r15,%rbx,8), %zmm9
-; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm4 * zmm9) + zmm0
-; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm1 = (zmm5 * zmm9) + zmm1
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm6 * zmm9) + zmm0
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm1 = (zmm7 * zmm9) + zmm1
 ; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm2 = (zmm8 * zmm9) + zmm2
 ; CHECK-NEXT:    vbroadcastsd (%r14,%rbx,8), %zmm9
-; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm4 = (zmm9 * zmm4) + zmm7
-; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm5 = (zmm9 * zmm5) + zmm6
-; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm3 = (zmm8 * zmm9) + zmm3
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm3 = (zmm9 * zmm6) + zmm3
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm4 = (zmm9 * zmm7) + zmm4
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm5 = (zmm8 * zmm9) + zmm5
 ; CHECK-NEXT:    incq %rbx
 ; CHECK-NEXT:    cmpq %rbx, %r10
 ; CHECK-NEXT:    jne LBB0_1
@@ -51,9 +49,9 @@ define void @eggs(<8 x double>* %arg, <8 x double>* %arg1, <8 x double>* %arg2,
 ; CHECK-NEXT:    vmovapd %zmm0, (%rdi)
 ; CHECK-NEXT:    vmovapd %zmm1, (%rsi)
 ; CHECK-NEXT:    vmovapd %zmm2, (%rdx)
-; CHECK-NEXT:    vmovapd %zmm4, (%rcx)
-; CHECK-NEXT:    vmovapd %zmm5, (%r8)
-; CHECK-NEXT:    vmovapd %zmm3, (%r9)
+; CHECK-NEXT:    vmovapd %zmm3, (%rcx)
+; CHECK-NEXT:    vmovapd %zmm4, (%r8)
+; CHECK-NEXT:    vmovapd %zmm5, (%r9)
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r12
 ; CHECK-NEXT:    popq %r13

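The reason the 231 forms above eliminate the vmovapd copies is the x86 FMA suffix convention: the three digits name the operands in "a * b + c" order, and the result always lands in operand 1. A small scalar C++ model of the two forms (doubles standing in for the zmm registers; the names are illustrative only):

#include <cassert>

// vfmadd213 dst, src2, src3: dst = src2 * dst + src3 (the multiplicand
// is overwritten).
// vfmadd231 dst, src2, src3: dst = src2 * src3 + dst (the accumulator
// itself is the tied destination).
static double fma213(double &dst, double src2, double src3) {
  return dst = src2 * dst + src3;
}
static double fma231(double &dst, double src2, double src3) {
  return dst = src2 * src3 + dst;
}

int main() {
  double acc213 = 0.0, acc231 = 0.0;
  for (int i = 1; i <= 3; ++i) {
    double a = i, b = 2.0;
    // 213 form: the multiplicand register is clobbered, so a loop-carried
    // accumulator must first be copied aside -- the vmovapd in the old code.
    double saved = acc213;
    double m = a;
    fma213(m, b, saved); // m = b*a + old accumulator
    acc213 = m;
    // 231 form: the accumulator is the destination, updated in place.
    fma231(acc231, b, a);
  }
  assert(acc213 == acc231);
  return 0;
}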
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index b1b9c1b735c4..99ce5eba08ff 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -144,9 +144,8 @@ define float @f32_one_step_variables(float %x, float %y) #1 {
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
 ; FMA-RECIP-NEXT:    vmulss %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT:    vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; FMA-RECIP-NEXT:    vmovaps %xmm2, %xmm0
+; FMA-RECIP-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BDVER2-LABEL: f32_one_step_variables:
@@ -181,9 +180,8 @@ define float @f32_one_step_variables(float %x, float %y) #1 {
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
 ; HASWELL-NEXT:    vmulss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; HASWELL-NEXT:    vmovaps %xmm2, %xmm0
+; HASWELL-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_variables:
@@ -200,9 +198,8 @@ define float @f32_one_step_variables(float %x, float %y) #1 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; AVX512-NEXT:    vmovaps %xmm2, %xmm0
+; AVX512-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; AVX512-NEXT:    retq
   %div = fdiv fast float %x, %y
   ret float %div
@@ -445,10 +442,11 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ;
 ; HASWELL-LABEL: v4f32_one_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm2
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
+; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step:
@@ -463,10 +461,11 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ;
 ; KNL-LABEL: v4f32_one_step:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %xmm0, %xmm1
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; KNL-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
+; KNL-NEXT:    vrcpps %xmm0, %xmm2
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT:    vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
+; KNL-NEXT:    vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
+; KNL-NEXT:    vmovaps %xmm1, %xmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v4f32_one_step:
@@ -505,9 +504,8 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %xmm1, %xmm2
 ; FMA-RECIP-NEXT:    vmulps %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; FMA-RECIP-NEXT:    vmovaps %xmm2, %xmm0
+; FMA-RECIP-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BDVER2-LABEL: v4f32_one_step_variables:
@@ -542,9 +540,8 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm1, %xmm2
 ; HASWELL-NEXT:    vmulps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; HASWELL-NEXT:    vmovaps %xmm2, %xmm0
+; HASWELL-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_variables:
@@ -561,9 +558,8 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vrcpps %xmm1, %xmm2
 ; AVX512-NEXT:    vmulps %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; AVX512-NEXT:    vmovaps %xmm2, %xmm0
+; AVX512-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; AVX512-NEXT:    retq
   %div = fdiv fast <4 x float> %x, %y
   ret <4 x float> %div
@@ -816,10 +812,11 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ;
 ; HASWELL-LABEL: v8f32_one_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
+; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
+; HASWELL-NEXT:    vmovaps %ymm1, %ymm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step:
@@ -834,10 +831,11 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ;
 ; KNL-LABEL: v8f32_one_step:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; KNL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
+; KNL-NEXT:    vrcpps %ymm0, %ymm2
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT:    vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
+; KNL-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
+; KNL-NEXT:    vmovaps %ymm1, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_one_step:

diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index c2bd531049f6..6e67e6eb452f 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -530,10 +530,10 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
 ; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
+; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm2, %xmm0
+; HASWELL-NEXT:    vmulps %xmm2, %xmm0, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
@@ -552,10 +552,10 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1
 ; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; KNL-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; KNL-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
+; KNL-NEXT:    vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm2, %xmm0
+; KNL-NEXT:    vmulps %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v4f32_one_step_2_divs:
@@ -892,10 +892,10 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
 ; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
+; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm2, %ymm0
+; HASWELL-NEXT:    vmulps %ymm2, %ymm0, %ymm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
@@ -914,10 +914,10 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1
 ; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; KNL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
+; KNL-NEXT:    vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm2, %ymm0
+; KNL-NEXT:    vmulps %ymm2, %ymm0, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_one_step_2_divs:

diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 37e6b6954dc2..3986c8f863d7 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -328,10 +328,10 @@ define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
 ; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vmulps %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
@@ -401,10 +401,10 @@ define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
 ; AVX512-NEXT:    vrsqrtps %ymm0, %ymm1
 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; AVX512-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
-; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; AVX512-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmulps %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt


        

