[llvm] r289632 - [X86][InstCombine] Handle scalar fmadd intrinsics correctly in SimplifyDemandedVectorElts.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 13 21:43:06 PST 2016
Author: ctopper
Date: Tue Dec 13 23:43:05 2016
New Revision: 289632
URL: http://llvm.org/viewvc/llvm-project?rev=289632&view=rev
Log:
[X86][InstCombine] Handle scalar fmadd intrinsics correctly in SimplifyDemandedVectorElts.
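We now pass an appropriately modified DemandedElts mask to each operand (the caller's mask for operand 0, only the lowest element for operands 1 and 2), and we calculate the undef elements of the result correctly.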
Modified:
llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=289632&r1=289631&r2=289632&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Tue Dec 13 23:43:05 2016
@@ -1754,14 +1754,6 @@ Instruction *InstCombiner::visitCallInst
break;
}
- case Intrinsic::x86_fma_vfmadd_ss:
- case Intrinsic::x86_fma_vfmsub_ss:
- case Intrinsic::x86_fma_vfnmadd_ss:
- case Intrinsic::x86_fma_vfnmsub_ss:
- case Intrinsic::x86_fma_vfmadd_sd:
- case Intrinsic::x86_fma_vfmsub_sd:
- case Intrinsic::x86_fma_vfnmadd_sd:
- case Intrinsic::x86_fma_vfnmsub_sd:
case Intrinsic::x86_avx512_mask_add_ss_round:
case Intrinsic::x86_avx512_mask_div_ss_round:
case Intrinsic::x86_avx512_mask_mul_ss_round:
@@ -1793,6 +1785,14 @@ Instruction *InstCombiner::visitCallInst
break;
}
+ case Intrinsic::x86_fma_vfmadd_ss:
+ case Intrinsic::x86_fma_vfmsub_ss:
+ case Intrinsic::x86_fma_vfnmadd_ss:
+ case Intrinsic::x86_fma_vfnmsub_ss:
+ case Intrinsic::x86_fma_vfmadd_sd:
+ case Intrinsic::x86_fma_vfmsub_sd:
+ case Intrinsic::x86_fma_vfnmadd_sd:
+ case Intrinsic::x86_fma_vfnmsub_sd:
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:
Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp?rev=289632&r1=289631&r2=289632&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp Tue Dec 13 23:43:05 2016
@@ -1349,6 +1349,9 @@ Value *InstCombiner::SimplifyDemandedVec
break;
}
+ // Three input scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0 and the low element is a function of all
+ // three inputs.
case Intrinsic::x86_fma_vfmadd_ss:
case Intrinsic::x86_fma_vfmsub_ss:
case Intrinsic::x86_fma_vfnmadd_ss:
@@ -1360,6 +1363,13 @@ Value *InstCombiner::SimplifyDemandedVec
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
UndefElts, Depth + 1);
if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0])
+ return II->getArgOperand(0);
+
+ // Only lower element is used for operand 1 and 2.
+ DemandedElts = 1;
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
UndefElts2, Depth + 1);
if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
@@ -1367,14 +1377,11 @@ Value *InstCombiner::SimplifyDemandedVec
UndefElts3, Depth + 1);
if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; }
- // If lowest element of a scalar op isn't used then use Arg0.
- if (DemandedElts.getLoBits(1) != 1)
- return II->getArgOperand(0);
+ // Lower element is undefined if all three lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0] || !UndefElts3[0])
+ UndefElts.clearBit(0);
- // Output elements are undefined if all three are undefined. Consider
- // things like undef&0. The result is known zero, not undef.
- UndefElts &= UndefElts2;
- UndefElts &= UndefElts3;
break;
// SSE4A instructions leave the upper 64-bits of the 128-bit result
More information about the llvm-commits mailing list