[llvm] r289377 - [X86][InstCombine] Add support for scalar FMA intrinsics to SimplifyDemandedVectorElts.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sun Dec 11 00:54:53 PST 2016


Author: ctopper
Date: Sun Dec 11 02:54:52 2016
New Revision: 289377

URL: http://llvm.org/viewvc/llvm-project?rev=289377&view=rev
Log:
[X86][InstCombine] Add support for scalar FMA intrinsics to SimplifyDemandedVectorElts.

This teaches SimplifyDemandedVectorElts that the FMA can be removed if the lower element of the result isn't used. It also teaches it that if the upper elements of the first operand aren't used, we can simplify them.
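
As a rough illustration (not part of the commit; the value names below are made up), when only an upper lane of the result is demanded the scalar FMA call now folds away to its first operand, since the intrinsic only writes lane 0 and passes the remaining lanes of operand 0 through:

  ; before
  %r = tail call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  %e = extractelement <4 x float> %r, i32 1    ; lane 0 of %r is never used
  ; after SimplifyDemandedVectorElts
  %e = extractelement <4 x float> %a, i32 1    ; the call is dropped, %a is used directly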

Modified:
    llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
    llvm/trunk/test/Transforms/InstCombine/x86-fma.ll

Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp?rev=289377&r1=289376&r2=289377&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp Sun Dec 11 02:54:52 2016
@@ -981,6 +981,7 @@ Value *InstCombiner::SimplifyDemandedVec
 
   bool MadeChange = false;
   APInt UndefElts2(VWidth, 0);
+  APInt UndefElts3(VWidth, 0);
   Value *TmpV;
   switch (I->getOpcode()) {
   default: break;
@@ -1298,6 +1299,34 @@ Value *InstCombiner::SimplifyDemandedVec
       UndefElts &= UndefElts2;
       break;
 
+    case Intrinsic::x86_fma_vfmadd_ss:
+    case Intrinsic::x86_fma_vfmsub_ss:
+    case Intrinsic::x86_fma_vfnmadd_ss:
+    case Intrinsic::x86_fma_vfnmsub_ss:
+    case Intrinsic::x86_fma_vfmadd_sd:
+    case Intrinsic::x86_fma_vfmsub_sd:
+    case Intrinsic::x86_fma_vfnmadd_sd:
+    case Intrinsic::x86_fma_vfnmsub_sd:
+      TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+                                        UndefElts, Depth + 1);
+      if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+      TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
+                                        UndefElts2, Depth + 1);
+      if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+      TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts,
+                                        UndefElts3, Depth + 1);
+      if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; }
+
+      // If lowest element of a scalar op isn't used then use Arg0.
+      if (DemandedElts.getLoBits(1) != 1)
+        return II->getArgOperand(0);
+
+      // Output elements are undefined if all three are undefined.  Consider
+      // things like undef&0.  The result is known zero, not undef.
+      UndefElts &= UndefElts2;
+      UndefElts &= UndefElts3;
+      break;
+
     // SSE4A instructions leave the upper 64-bits of the 128-bit result
     // in an undefined state.
     case Intrinsic::x86_sse4a_extrq:

Modified: llvm/trunk/test/Transforms/InstCombine/x86-fma.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/x86-fma.ll?rev=289377&r1=289376&r2=289377&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-fma.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/x86-fma.ll Sun Dec 11 02:54:52 2016
@@ -19,6 +19,32 @@ define <4 x float> @test_vfmadd_ss(<4 x
   ret <4 x float> %res
 }
 
+define float @test_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test_vfmadd_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
+  %5 = extractelement <4 x float> %4, i32 0
+  ret float %5
+}
+
+define float @test_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test_vfmadd_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
+  %5 = extractelement <4 x float> %4, i32 1
+  ret float %5
+}
+
 declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
 
 define <2 x double> @test_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
@@ -32,6 +58,28 @@ define <2 x double> @test_vfmadd_sd(<2 x
   ret <2 x double> %res
 }
 
+define double @test_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: @test_vfmadd_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    ret double [[TMP2]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define double @test_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: @test_vfmadd_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
+  %3 = extractelement <2 x double> %2, i32 1
+  ret double %3
+}
+
 declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
 
 define <4 x float> @test_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
@@ -49,6 +97,32 @@ define <4 x float> @test_vfmsub_ss(<4 x
   ret <4 x float> %res
 }
 
+define float @test_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test_vfmsub_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
+  %5 = extractelement <4 x float> %4, i32 0
+  ret float %5
+}
+
+define float @test_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test_vfmsub_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
+  %5 = extractelement <4 x float> %4, i32 1
+  ret float %5
+}
+
 declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
 
 define <2 x double> @test_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
@@ -62,6 +136,28 @@ define <2 x double> @test_vfmsub_sd(<2 x
   ret <2 x double> %res
 }
 
+define double @test_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: @test_vfmsub_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    ret double [[TMP2]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define double @test_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: @test_vfmsub_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
+  %3 = extractelement <2 x double> %2, i32 1
+  ret double %3
+}
+
 declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
 
 define <4 x float> @test_vfnmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
@@ -79,6 +175,32 @@ define <4 x float> @test_vfnmadd_ss(<4 x
   ret <4 x float> %res
 }
 
+define float @test_vfnmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test_vfnmadd_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
+  %5 = extractelement <4 x float> %4, i32 0
+  ret float %5
+}
+
+define float @test_vfnmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test_vfnmadd_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
+  %5 = extractelement <4 x float> %4, i32 1
+  ret float %5
+}
+
 declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
 
 define <2 x double> @test_vfnmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
@@ -92,6 +214,28 @@ define <2 x double> @test_vfnmadd_sd(<2
   ret <2 x double> %res
 }
 
+define double @test_vfnmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: @test_vfnmadd_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    ret double [[TMP2]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define double @test_vfnmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: @test_vfnmadd_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
+  %3 = extractelement <2 x double> %2, i32 1
+  ret double %3
+}
+
 declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)
 
 define <4 x float> @test_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
@@ -109,6 +253,32 @@ define <4 x float> @test_vfnmsub_ss(<4 x
   ret <4 x float> %res
 }
 
+define float @test_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test_vfnmsub_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
+  %5 = extractelement <4 x float> %4, i32 0
+  ret float %5
+}
+
+define float @test_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test_vfnmsub_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
+  %5 = extractelement <4 x float> %4, i32 1
+  ret float %5
+}
+
 declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
 
 define <2 x double> @test_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
@@ -121,3 +291,25 @@ define <2 x double> @test_vfnmsub_sd(<2
   %res = tail call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a, <2 x double> %1, <2 x double> %2)
   ret <2 x double> %res
 }
+
+define double @test_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: @test_vfnmsub_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    ret double [[TMP2]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define double @test_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: @test_vfnmsub_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
+  %3 = extractelement <2 x double> %2, i32 1
+  ret double %3
+}
