[llvm] b6e2796 - [X86][TwoAddressInstructionPass] Teach tryInstructionCommute to continue checking for commutable FMA operands in more cases.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 1 16:38:57 PST 2020


Author: Craig Topper
Date: 2020-03-01T16:38:08-08:00
New Revision: b6e2796114d08aadfabe8c889b5d96e6bc4f5e0e

URL: https://github.com/llvm/llvm-project/commit/b6e2796114d08aadfabe8c889b5d96e6bc4f5e0e
DIFF: https://github.com/llvm/llvm-project/commit/b6e2796114d08aadfabe8c889b5d96e6bc4f5e0e.diff

LOG: [X86][TwoAddressInstructionPass] Teach tryInstructionCommute to continue checking for commutable FMA operands in more cases.

Previously, we would only check for another commutable operand if the first commute was an aggressive commute.

But if we have two killed operands and neither is tied to the def at the start, we should consider either operand as a candidate to become the new def.

This improves the loop in the fma-commute-loop.ll test: the vfmadd213 instructions, along with the vmovapd copies they forced, become vfmadd231 forms that accumulate directly into the loop-carried registers. The test is derived from a Discourse post: https://llvm.discourse.group/t/unnecessary-vmovapd-instructions-generated-can-you-hint-in-favor-of-vfmadd231pd/582
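
As a rough standalone sketch of the control-flow change (the Operand struct and the shouldCommute heuristic below are simplified stand-ins, not the real LLVM API): after any successful commute we now adopt the other operand as the new base and keep scanning, instead of returning early when the commute was kill-based.

#include <cstdio>
#include <vector>

struct Operand {
  unsigned Reg;
  bool Killed;
};

// Simplified stand-in for the kill-based profitability heuristic; the
// real isProfitableToCommute considers much more.
static bool shouldCommute(const Operand &Base, const Operand &Other) {
  return Other.Killed && !Base.Killed;
}

int main() {
  // Hypothetical three-source FMA-like instruction; Ops[0] starts out as
  // the operand tied to the def.
  std::vector<Operand> Ops = {{1, false}, {2, true}, {3, true}};

  Operand Base = Ops[0];
  for (std::size_t OtherIdx = 1; OtherIdx < Ops.size(); ++OtherIdx) {
    Operand Other = Ops[OtherIdx];
    if (!shouldCommute(Base, Other))
      continue;
    std::printf("commute r%u <-> r%u\n", Base.Reg, Other.Reg);
    // Post-patch behavior: make the other operand the new base and keep
    // scanning. The old code returned here unless the commute was an
    // aggressive one. (The real pass also resamples the operand count,
    // since X86 can reduce it.)
    Base = Other;
  }
  return 0;
}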

Differential Revision: https://reviews.llvm.org/D75016

Added: 
    

Modified: 
    llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
    llvm/test/CodeGen/X86/fma-commute-loop.ll
    llvm/test/CodeGen/X86/recip-fastmath.ll
    llvm/test/CodeGen/X86/recip-fastmath2.ll
    llvm/test/CodeGen/X86/sqrt-fastmath.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 2b1ffab74b6f..336077f297d2 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1238,21 +1238,18 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
                                         Dist)) {
       MadeChange = true;
       ++NumCommuted;
-      if (AggressiveCommute) {
+      if (AggressiveCommute)
         ++NumAggrCommuted;
-        // There might be more than two commutable operands, update BaseOp and
-        // continue scanning.
-        // FIXME: This assumes that the new instruction's operands are in the
-        // same positions and were simply swapped.
-        BaseOpReg = OtherOpReg;
-        BaseOpKilled = OtherOpKilled;
-        // Resamples OpsNum in case the number of operands was reduced. This
-        // happens with X86.
-        OpsNum = MI->getDesc().getNumOperands();
-        continue;
-      }
-      // If this was a commute based on kill, we won't do better continuing.
-      return MadeChange;
+
+      // There might be more than two commutable operands, update BaseOp and
+      // continue scanning.
+      // FIXME: This assumes that the new instruction's operands are in the
+      // same positions and were simply swapped.
+      BaseOpReg = OtherOpReg;
+      BaseOpKilled = OtherOpKilled;
+      // Resamples OpsNum in case the number of operands was reduced. This
+      // happens with X86.
+      OpsNum = MI->getDesc().getNumOperands();
     }
   }
   return MadeChange;

diff --git a/llvm/test/CodeGen/X86/fma-commute-loop.ll b/llvm/test/CodeGen/X86/fma-commute-loop.ll
index f96e9c12dba3..6b0bceb88f47 100644
--- a/llvm/test/CodeGen/X86/fma-commute-loop.ll
+++ b/llvm/test/CodeGen/X86/fma-commute-loop.ll
@@ -25,25 +25,23 @@ define void @eggs(<8 x double>* %arg, <8 x double>* %arg1, <8 x double>* %arg2,
 ; CHECK-NEXT:    addq {{[0-9]+}}(%rsp), %r12
 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT:    vxorpd %xmm5, %xmm5, %xmm5
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_1: ## %bb15
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovapd %zmm5, %zmm6
-; CHECK-NEXT:    vmovapd %zmm4, %zmm7
-; CHECK-NEXT:    vmovupd (%rax,%r11,8), %zmm4
-; CHECK-NEXT:    vmovupd (%rax,%r13,8), %zmm5
+; CHECK-NEXT:    vmovupd (%rax,%r11,8), %zmm6
+; CHECK-NEXT:    vmovupd (%rax,%r13,8), %zmm7
 ; CHECK-NEXT:    vmovupd (%rax,%r12,8), %zmm8
 ; CHECK-NEXT:    vbroadcastsd (%r15,%rbx,8), %zmm9
-; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm4 * zmm9) + zmm0
-; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm1 = (zmm5 * zmm9) + zmm1
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm6 * zmm9) + zmm0
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm1 = (zmm7 * zmm9) + zmm1
 ; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm2 = (zmm8 * zmm9) + zmm2
 ; CHECK-NEXT:    vbroadcastsd (%r14,%rbx,8), %zmm9
-; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm4 = (zmm9 * zmm4) + zmm7
-; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm5 = (zmm9 * zmm5) + zmm6
-; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm3 = (zmm8 * zmm9) + zmm3
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm3 = (zmm9 * zmm6) + zmm3
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm4 = (zmm9 * zmm7) + zmm4
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm5 = (zmm8 * zmm9) + zmm5
 ; CHECK-NEXT:    incq %rbx
 ; CHECK-NEXT:    cmpq %rbx, %r10
 ; CHECK-NEXT:    jne LBB0_1
@@ -51,9 +49,9 @@ define void @eggs(<8 x double>* %arg, <8 x double>* %arg1, <8 x double>* %arg2,
 ; CHECK-NEXT:    vmovapd %zmm0, (%rdi)
 ; CHECK-NEXT:    vmovapd %zmm1, (%rsi)
 ; CHECK-NEXT:    vmovapd %zmm2, (%rdx)
-; CHECK-NEXT:    vmovapd %zmm4, (%rcx)
-; CHECK-NEXT:    vmovapd %zmm5, (%r8)
-; CHECK-NEXT:    vmovapd %zmm3, (%r9)
+; CHECK-NEXT:    vmovapd %zmm3, (%rcx)
+; CHECK-NEXT:    vmovapd %zmm4, (%r8)
+; CHECK-NEXT:    vmovapd %zmm5, (%r9)
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r12
 ; CHECK-NEXT:    popq %r13

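The reason the 231 forms above eliminate the vmovapd copies is the x86 FMA suffix convention: the three digits name the operands in "a * b + c" order, and the result always lands in operand 1. A small scalar C++ model of the two forms (doubles standing in for the zmm registers; the names are illustrative only):

#include <cassert>

// vfmadd213 dst, src2, src3: dst = src2 * dst + src3 (the multiplicand
// is overwritten).
// vfmadd231 dst, src2, src3: dst = src2 * src3 + dst (the accumulator
// itself is the tied destination).
static double fma213(double &dst, double src2, double src3) {
  return dst = src2 * dst + src3;
}
static double fma231(double &dst, double src2, double src3) {
  return dst = src2 * src3 + dst;
}

int main() {
  double acc213 = 0.0, acc231 = 0.0;
  for (int i = 1; i <= 3; ++i) {
    double a = i, b = 2.0;
    // 213 form: the multiplicand register is clobbered, so a loop-carried
    // accumulator must first be copied aside -- the vmovapd in the old code.
    double saved = acc213;
    double m = a;
    fma213(m, b, saved); // m = b*a + old accumulator
    acc213 = m;
    // 231 form: the accumulator is the destination, updated in place.
    fma231(acc231, b, a);
  }
  assert(acc213 == acc231);
  return 0;
}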
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index b1b9c1b735c4..99ce5eba08ff 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -144,9 +144,8 @@ define float @f32_one_step_variables(float %x, float %y) #1 {
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
 ; FMA-RECIP-NEXT:    vmulss %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT:    vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; FMA-RECIP-NEXT:    vmovaps %xmm2, %xmm0
+; FMA-RECIP-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BDVER2-LABEL: f32_one_step_variables:
@@ -181,9 +180,8 @@ define float @f32_one_step_variables(float %x, float %y) #1 {
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
 ; HASWELL-NEXT:    vmulss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; HASWELL-NEXT:    vmovaps %xmm2, %xmm0
+; HASWELL-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_variables:
@@ -200,9 +198,8 @@ define float @f32_one_step_variables(float %x, float %y) #1 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; AVX512-NEXT:    vmovaps %xmm2, %xmm0
+; AVX512-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; AVX512-NEXT:    retq
   %div = fdiv fast float %x, %y
   ret float %div
@@ -445,10 +442,11 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ;
 ; HASWELL-LABEL: v4f32_one_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm2
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
+; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step:
@@ -463,10 +461,11 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ;
 ; KNL-LABEL: v4f32_one_step:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %xmm0, %xmm1
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; KNL-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
+; KNL-NEXT:    vrcpps %xmm0, %xmm2
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT:    vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
+; KNL-NEXT:    vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
+; KNL-NEXT:    vmovaps %xmm1, %xmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v4f32_one_step:
@@ -505,9 +504,8 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %xmm1, %xmm2
 ; FMA-RECIP-NEXT:    vmulps %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; FMA-RECIP-NEXT:    vmovaps %xmm2, %xmm0
+; FMA-RECIP-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BDVER2-LABEL: v4f32_one_step_variables:
@@ -542,9 +540,8 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm1, %xmm2
 ; HASWELL-NEXT:    vmulps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; HASWELL-NEXT:    vmovaps %xmm2, %xmm0
+; HASWELL-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_variables:
@@ -561,9 +558,8 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vrcpps %xmm1, %xmm2
 ; AVX512-NEXT:    vmulps %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; AVX512-NEXT:    vmovaps %xmm2, %xmm0
+; AVX512-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; AVX512-NEXT:    retq
   %div = fdiv fast <4 x float> %x, %y
   ret <4 x float> %div
@@ -816,10 +812,11 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ;
 ; HASWELL-LABEL: v8f32_one_step:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT:    vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
+; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
+; HASWELL-NEXT:    vmovaps %ymm1, %ymm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step:
@@ -834,10 +831,11 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ;
 ; KNL-LABEL: v8f32_one_step:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; KNL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
+; KNL-NEXT:    vrcpps %ymm0, %ymm2
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT:    vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
+; KNL-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
+; KNL-NEXT:    vmovaps %ymm1, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_one_step:

diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index c2bd531049f6..6e67e6eb452f 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -530,10 +530,10 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
 ; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
+; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm2, %xmm0
+; HASWELL-NEXT:    vmulps %xmm2, %xmm0, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
@@ -552,10 +552,10 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1
 ; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; KNL-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; KNL-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
+; KNL-NEXT:    vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm2, %xmm0
+; KNL-NEXT:    vmulps %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v4f32_one_step_2_divs:
@@ -892,10 +892,10 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
 ; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
+; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm2, %ymm0
+; HASWELL-NEXT:    vmulps %ymm2, %ymm0, %ymm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
@@ -914,10 +914,10 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1
 ; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; KNL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
+; KNL-NEXT:    vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm2, %ymm0
+; KNL-NEXT:    vmulps %ymm2, %ymm0, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_one_step_2_divs:

diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 37e6b6954dc2..3986c8f863d7 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -328,10 +328,10 @@ define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
 ; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vmulps %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
@@ -401,10 +401,10 @@ define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
 ; AVX512-NEXT:    vrsqrtps %ymm0, %ymm1
 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; AVX512-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
-; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT:    vmulps %ymm2, %ymm1, %ymm1
+; AVX512-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmulps %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt


        

