[llvm-branch-commits] [llvm] 204bdc5 - [InstCombine][x86] fix insertion point bug in vector demanded elts fold (PR48476)
Sanjay Patel via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Dec 11 14:28:13 PST 2020
Author: Sanjay Patel
Date: 2020-12-11T17:23:35-05:00
New Revision: 204bdc5322cc89603d503fb1f02a0eba19a1b496
URL: https://github.com/llvm/llvm-project/commit/204bdc5322cc89603d503fb1f02a0eba19a1b496
DIFF: https://github.com/llvm/llvm-project/commit/204bdc5322cc89603d503fb1f02a0eba19a1b496.diff
LOG: [InstCombine][x86] fix insertion point bug in vector demanded elts fold (PR48476)
This transform was added at:
c63799fc52ff
From what I see, it's the first demanded elements transform that adds
a new instruction using the IRBuilder. There are similar folds in
the generic demanded bits chunk of instcombine that also use the
InsertPointGuard code pattern.
The tests here would assert/crash because the new instruction was
being added at the start of the demanded elements analysis rather
than at the instruction that is being replaced.
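For context, the InsertPointGuard pattern in question looks roughly like
this (a minimal sketch, not the committed code; the standalone helper and
its name are illustrative only):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical helper: build the replacement binop next to the instruction
// being replaced, rather than wherever the builder currently points. The
// guard saves the builder's insertion point and restores it when the helper
// returns, so the surrounding analysis code is unaffected.
static Value *createReplacementBinOp(IRBuilderBase &Builder, Instruction &II,
                                     Instruction::BinaryOps Opc,
                                     Value *LHS, Value *RHS) {
  IRBuilderBase::InsertPointGuard Guard(Builder);
  Builder.SetInsertPoint(&II); // new instruction lands right before II
  return Builder.CreateBinOp(Opc, LHS, RHS);
}
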
Added:
Modified:
llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
llvm/test/Transforms/InstCombine/X86/x86-addsub.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 3b05dba57a33..ca026baa2c41 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -1916,13 +1916,20 @@ Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
case Intrinsic::x86_sse3_addsub_ps:
case Intrinsic::x86_avx_addsub_pd_256:
case Intrinsic::x86_avx_addsub_ps_256: {
+ // If none of the even or none of the odd lanes are required, turn this
+ // into a generic FP math instruction.
APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
- if (DemandedElts.isSubsetOf(SubMask))
- return IC.Builder.CreateFSub(II.getArgOperand(0), II.getArgOperand(1));
-
APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
- if (DemandedElts.isSubsetOf(AddMask))
- return IC.Builder.CreateFAdd(II.getArgOperand(0), II.getArgOperand(1));
+ bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
+ bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
+ if (IsSubOnly || IsAddOnly) {
+ assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
+ IRBuilderBase::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.SetInsertPoint(&II);
+ Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
+ return IC.Builder.CreateBinOp(
+ IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
+ }
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-addsub.ll b/llvm/test/Transforms/InstCombine/X86/x86-addsub.ll
index d051732ee819..0b9831be8fcf 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-addsub.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-addsub.ll
@@ -5,6 +5,7 @@ declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>)
declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>)
declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>)
declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>)
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8 immarg) #0
;
; Demanded Elts
@@ -164,4 +165,30 @@ define void @PR46277(float %0, float %1, float %2, float %3, <4 x float> %4, flo
ret void
}
+define double @PR48476_fsub(<2 x double> %x) {
+; CHECK-LABEL: @PR48476_fsub(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> <double 0.000000e+00, double undef>, [[X:%.*]]
+; CHECK-NEXT: [[T2:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP1]], <2 x double> [[X]], i8 6)
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x double> [[T2]], i32 0
+; CHECK-NEXT: ret double [[VECEXT]]
+;
+ %t1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> zeroinitializer, <2 x double> %x)
+ %t2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %t1, <2 x double> %x, i8 6)
+ %vecext = extractelement <2 x double> %t2, i32 0
+ ret double %vecext
+}
+define double @PR48476_fadd_fsub(<2 x double> %x) {
+; CHECK-LABEL: @PR48476_fadd_fsub(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[X:%.*]], <double undef, double 0.000000e+00>
+; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[S]], [[X]]
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT: ret double [[VECEXT]]
+;
+ %t1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> zeroinitializer, <2 x double> %x)
+ %s = shufflevector <2 x double> %t1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %t2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %s, <2 x double> %x)
+ %vecext = extractelement <2 x double> %t2, i32 0
+ ret double %vecext
+}