[llvm] r289632 - [X86][InstCombine] Handle scalar fmadd intrinsics correctly in SimplifyDemandedVectorElts.

Tue Dec 13 21:43:06 PST 2016

Author: ctopper
Date: Tue Dec 13 23:43:05 2016
New Revision: 289632

URL: http://llvm.org/viewvc/llvm-project?rev=289632&view=rev
Log:
[X86][InstCombine] Handle scalar fmadd intrinsics correctly in SimplifyDemandedVectorElts.

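Now we pass a modified version of DemandedElts to each operand and we calculate undef elts correctly. Specifically, operand 0 is still simplified with the caller's DemandedElts, while operands 1 and 2 are simplified with only element 0 demanded; in the result, element 0 stays marked undef only if it is undef in all three operands, and the undef state of the upper elements is taken from operand 0 alone.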

Modified:
    llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
    llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp

Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=289632&r1=289631&r2=289632&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Tue Dec 13 23:43:05 2016
@@ -1754,14 +1754,6 @@ Instruction *InstCombiner::visitCallInst
     break;
   }
 
-  case Intrinsic::x86_fma_vfmadd_ss:
-  case Intrinsic::x86_fma_vfmsub_ss:
-  case Intrinsic::x86_fma_vfnmadd_ss:
-  case Intrinsic::x86_fma_vfnmsub_ss:
-  case Intrinsic::x86_fma_vfmadd_sd:
-  case Intrinsic::x86_fma_vfmsub_sd:
-  case Intrinsic::x86_fma_vfnmadd_sd:
-  case Intrinsic::x86_fma_vfnmsub_sd:
   case Intrinsic::x86_avx512_mask_add_ss_round:
   case Intrinsic::x86_avx512_mask_div_ss_round:
   case Intrinsic::x86_avx512_mask_mul_ss_round:
@@ -1793,6 +1785,14 @@ Instruction *InstCombiner::visitCallInst
     break;
   }
 
+  case Intrinsic::x86_fma_vfmadd_ss:
+  case Intrinsic::x86_fma_vfmsub_ss:
+  case Intrinsic::x86_fma_vfnmadd_ss:
+  case Intrinsic::x86_fma_vfnmsub_ss:
+  case Intrinsic::x86_fma_vfmadd_sd:
+  case Intrinsic::x86_fma_vfmsub_sd:
+  case Intrinsic::x86_fma_vfnmadd_sd:
+  case Intrinsic::x86_fma_vfnmsub_sd:
   case Intrinsic::x86_sse_cmp_ss:
   case Intrinsic::x86_sse_min_ss:
   case Intrinsic::x86_sse_max_ss:

Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp?rev=289632&r1=289631&r2=289632&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp Tue Dec 13 23:43:05 2016
@@ -1349,6 +1349,9 @@ Value *InstCombiner::SimplifyDemandedVec
       break;
     }
 
+    // Three input scalar-as-vector operations that work column-wise. The high
+    // elements come from operand 0 and the low element is a function of all
+    // three inputs.
     case Intrinsic::x86_fma_vfmadd_ss:
     case Intrinsic::x86_fma_vfmsub_ss:
     case Intrinsic::x86_fma_vfnmadd_ss:
@@ -1360,6 +1363,13 @@ Value *InstCombiner::SimplifyDemandedVec
       TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
                                         UndefElts, Depth + 1);
       if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+      // If lowest element of a scalar op isn't used then use Arg0.
+      if (!DemandedElts[0])
+        return II->getArgOperand(0);
+
+      // Only lower element is used for operand 1 and 2.
+      DemandedElts = 1;
       TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
                                         UndefElts2, Depth + 1);
       if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
@@ -1367,14 +1377,11 @@ Value *InstCombiner::SimplifyDemandedVec
                                         UndefElts3, Depth + 1);
       if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; }
 
-      // If lowest element of a scalar op isn't used then use Arg0.
-      if (DemandedElts.getLoBits(1) != 1)
-        return II->getArgOperand(0);
+      // Lower element is undefined if all three lower elements are undefined.
+      // Consider things like undef&0.  The result is known zero, not undef.
+      if (!UndefElts2[0] || !UndefElts3[0])
+        UndefElts.clearBit(0);
 
-      // Output elements are undefined if all three are undefined.  Consider
-      // things like undef&0.  The result is known zero, not undef.
-      UndefElts &= UndefElts2;
-      UndefElts &= UndefElts3;
       break;
 
     // SSE4A instructions leave the upper 64-bits of the 128-bit result