[llvm] r347303 - [X86][SSE] Add computeKnownBits/ComputeNumSignBits support for PACKSS/PACKUS instructions.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 20 05:23:38 PST 2018
Author: rksimon
Date: Tue Nov 20 05:23:37 2018
New Revision: 347303
URL: http://llvm.org/viewvc/llvm-project?rev=347303&view=rev
Log:
[X86][SSE] Add computeKnownBits/ComputeNumSignBits support for PACKSS/PACKUS instructions.
Pull the getPackDemandedElts demanded-elts remapping helper out of SimplifyDemandedVectorEltsForTargetNode and reuse it in computeKnownBitsForTargetNode/ComputeNumSignBitsForTargetNode.
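For illustration, here is a minimal standalone sketch of the element remapping that the new getPackDemandedElts helper (added in the diff below) performs. The 256-bit VPACKSSDW shapes, the particular demanded elements, and the use of std::bitset in place of llvm::APInt are assumptions made purely for this example:

// Standalone sketch (not part of the patch): mirrors the DemandedElts
// remapping done by getPackDemandedElts, using std::bitset instead of
// llvm::APInt. Example shapes: 256-bit VPACKSSDW, i.e. a v16i16 result
// built lane by lane from two v8i32 operands.
#include <bitset>
#include <cstdio>

int main() {
  const int VectorBits = 256;
  const int NumElts = 16;                                  // result elements
  const int NumLanes = VectorBits / 128;                   // 2
  const int NumInnerElts = NumElts / 2;                    // 8 elements per operand
  const int NumEltsPerLane = NumElts / NumLanes;           // 8
  const int NumInnerEltsPerLane = NumInnerElts / NumLanes; // 4

  std::bitset<NumElts> DemandedElts;
  DemandedElts.set(0);  // result elt 0  -> LHS, lane 0
  DemandedElts.set(5);  // result elt 5  -> RHS, lane 0
  DemandedElts.set(10); // result elt 10 -> LHS, lane 1

  std::bitset<NumInnerElts> DemandedLHS, DemandedRHS;
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
      int OuterIdx = (Lane * NumEltsPerLane) + Elt;
      int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
      if (DemandedElts[OuterIdx])                       // low half of each lane comes from the LHS
        DemandedLHS.set(InnerIdx);
      if (DemandedElts[OuterIdx + NumInnerEltsPerLane]) // high half comes from the RHS
        DemandedRHS.set(InnerIdx);
    }
  }

  std::printf("DemandedLHS = %s\nDemandedRHS = %s\n",
              DemandedLHS.to_string().c_str(),
              DemandedRHS.to_string().c_str());
}

Running it prints DemandedLHS = 01000001 and DemandedRHS = 00000010 (bit 0 rightmost), matching the half-per-lane operand layout that PACKSS/PACKUS produce.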
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/combine-srl.ll
llvm/trunk/test/CodeGen/X86/psubus.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=347303&r1=347302&r2=347303&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Nov 20 05:23:37 2018
@@ -5932,6 +5932,31 @@ static void createPackShuffleMask(MVT VT
}
}
+// Split the demanded elts of a PACKSS/PACKUS node between its operands.
+static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
+                                APInt &DemandedLHS, APInt &DemandedRHS) {
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumElts = DemandedElts.getBitWidth();
+  int NumInnerElts = NumElts / 2;
+  int NumEltsPerLane = NumElts / NumLanes;
+  int NumInnerEltsPerLane = NumInnerElts / NumLanes;
+
+  DemandedLHS = APInt::getNullValue(NumInnerElts);
+  DemandedRHS = APInt::getNullValue(NumInnerElts);
+
+  // Map DemandedElts to the packed operands.
+  for (int Lane = 0; Lane != NumLanes; ++Lane) {
+    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
+      int OuterIdx = (Lane * NumEltsPerLane) + Elt;
+      int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
+      if (DemandedElts[OuterIdx])
+        DemandedLHS.setBit(InnerIdx);
+      if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
+        DemandedRHS.setBit(InnerIdx);
+    }
+  }
+}
+
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
@@ -29938,12 +29963,24 @@ void X86TargetLowering::computeKnownBits
}
case X86ISD::PACKUS: {
// PACKUS is just a truncation if the upper half is zero.
- // TODO: Add DemandedElts support.
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+
+ Known.One = APInt::getAllOnesValue(BitWidth * 2);
+ Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
+
KnownBits Known2;
- DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
- DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
- Known.One &= Known2.One;
- Known.Zero &= Known2.Zero;
+ if (!!DemandedLHS) {
+ DAG.computeKnownBits(Op.getOperand(0), Known2, DemandedLHS, Depth + 1);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
+ }
+ if (!!DemandedRHS) {
+ DAG.computeKnownBits(Op.getOperand(1), Known2, DemandedRHS, Depth + 1);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
+ }
+
if (Known.countMinLeadingZeros() < BitWidth)
Known.resetAll();
Known = Known.trunc(BitWidth);
@@ -30039,10 +30076,16 @@ unsigned X86TargetLowering::ComputeNumSi
case X86ISD::PACKSS: {
// PACKSS is just a truncation if the sign bits extend to the packed size.
- // TODO: Add DemandedElts support.
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
+ DemandedRHS);
+
unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
- unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
- unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+ unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
+ if (!!DemandedLHS)
+ Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
+ if (!!DemandedRHS)
+ Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
unsigned Tmp = std::min(Tmp0, Tmp1);
if (Tmp > (SrcBits - VTBits))
return Tmp - (SrcBits - VTBits);
@@ -32226,24 +32269,8 @@ bool X86TargetLowering::SimplifyDemanded
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
- int NumLanes = VT.getSizeInBits() / 128;
- int NumInnerElts = NumElts / 2;
- int NumEltsPerLane = NumElts / NumLanes;
- int NumInnerEltsPerLane = NumInnerElts / NumLanes;
-
- // Map DemandedElts to the packed operands.
- APInt DemandedLHS = APInt::getNullValue(NumInnerElts);
- APInt DemandedRHS = APInt::getNullValue(NumInnerElts);
- for (int Lane = 0; Lane != NumLanes; ++Lane) {
- for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
- int OuterIdx = (Lane * NumEltsPerLane) + Elt;
- int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
- if (DemandedElts[OuterIdx])
- DemandedLHS.setBit(InnerIdx);
- if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
- DemandedRHS.setBit(InnerIdx);
- }
- }
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,
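For reference, a small standalone sketch of the sign-bit arithmetic that the PACKSS case above relies on; the PACKSSWB element widths and the sign-bit counts are illustrative assumptions, not values taken from a real DAG:

// Sketch only: the sign-bit bookkeeping used by the PACKSS case above.
// PACKSSWB saturates i16 elements to i8; when every demanded source element
// already has more than SrcBits - VTBits sign bits, saturation degenerates to
// truncation and the packed result keeps Tmp - (SrcBits - VTBits) sign bits.
#include <algorithm>
#include <cstdio>

int main() {
  const unsigned SrcBits = 16; // PACKSSWB source element width
  const unsigned VTBits = 8;   // packed result element width
  const unsigned Tmp0 = 12;    // sign bits known for the demanded LHS elements
  const unsigned Tmp1 = 10;    // sign bits known for the demanded RHS elements

  const unsigned Tmp = std::min(Tmp0, Tmp1);
  if (Tmp > (SrcBits - VTBits))
    std::printf("packed elements have at least %u sign bits\n",
                Tmp - (SrcBits - VTBits)); // prints 2 for these inputs
  else
    std::printf("only the trivial 1 sign bit is known\n");
  return 0;
}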
Modified: llvm/trunk/test/CodeGen/X86/combine-srl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-srl.ll?rev=347303&r1=347302&r2=347303&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-srl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-srl.ll Tue Nov 20 05:23:37 2018
@@ -237,25 +237,7 @@ define <4 x i32> @combine_vec_lshr_trunc
define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero1:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrlq $51, %xmm2
-; SSE-NEXT: psrlq $50, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrlq $49, %xmm2
-; SSE-NEXT: psrlq $48, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT: packusdw %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $27, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrld $25, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $26, %xmm1
-; SSE-NEXT: psrld $24, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1:
Modified: llvm/trunk/test/CodeGen/X86/psubus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/psubus.ll?rev=347303&r1=347302&r2=347303&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/psubus.ll (original)
+++ llvm/trunk/test/CodeGen/X86/psubus.ll Tue Nov 20 05:23:37 2018
@@ -1681,17 +1681,17 @@ define <8 x i16> @psubus_8i64_max(<8 x i
;
; SSE41-LABEL: psubus_8i64_max:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: movdqa %xmm0, %xmm10
+; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT: movdqa %xmm8, %xmm7
+; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT: movdqa %xmm9, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm5
+; SSE41-NEXT: pand %xmm10, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535]
@@ -1699,12 +1699,12 @@ define <8 x i16> @psubus_8i64_max(<8 x i
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm11
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm9, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm5
+; SSE41-NEXT: pand %xmm10, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm4
@@ -1712,21 +1712,21 @@ define <8 x i16> @psubus_8i64_max(<8 x i
; SSE41-NEXT: packusdw %xmm11, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm9, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm5
+; SSE41-NEXT: pand %xmm10, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: pxor %xmm1, %xmm6
-; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm0
; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE41-NEXT: pand %xmm2, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -1734,11 +1734,8 @@ define <8 x i16> @psubus_8i64_max(<8 x i
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
; SSE41-NEXT: packusdw %xmm3, %xmm7
; SSE41-NEXT: packusdw %xmm4, %xmm7
-; SSE41-NEXT: psubusw %xmm7, %xmm10
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
-; SSE41-NEXT: packusdw %xmm10, %xmm0
+; SSE41-NEXT: psubusw %xmm7, %xmm8
+; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: psubus_8i64_max: