[llvm] dcf4657 - [X86] combineAnd - add SimplifyMultipleUseDemandedBits handling to masked vector element analysis
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 12 07:32:38 PST 2022
Author: Simon Pilgrim
Date: 2022-02-12T15:30:53Z
New Revision: dcf465731d88ec323edb435df953bf6935c216e4
URL: https://github.com/llvm/llvm-project/commit/dcf465731d88ec323edb435df953bf6935c216e4
DIFF: https://github.com/llvm/llvm-project/commit/dcf465731d88ec323edb435df953bf6935c216e4.diff
LOG: [X86] combineAnd - add SimplifyMultipleUseDemandedBits handling to masked vector element analysis
Extend the existing fold to use SimplifyMultipleUseDemandedBits as well as SimplifyDemandedVectorElts/SimplifyDemandedBits when attempting to simplify based on known-zero vector elements.
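For context on the patch itself: combineAnd previously only ran the in-place SimplifyDemandedVectorElts/SimplifyDemandedBits folds when one operand was a constant mask. The new GetDemandedMasks lambda computes the demanded bits/elements unconditionally (defaulting to all-ones when the operand isn't a constant mask), and the added SimplifyMultipleUseDemandedBits calls can then return an already-existing, simpler node for an operand even when that operand has other uses, since the original node is left untouched. A minimal standalone sketch of the mask computation, with plain uint64_t lanes standing in for LLVM's APInt/SDValue (names are illustrative, not LLVM's API):

#include <cstdint>
#include <cstdio>
#include <vector>

// Rough model of the patch's GetDemandedMasks: for and(X, MaskConstant),
// only the lanes where the constant is nonzero, and only the bits set in
// those lanes, are demanded from X.
struct DemandedMasks {
  uint64_t Bits; // union of bits demanded across all lanes
  uint64_t Elts; // one bit per demanded lane
};

DemandedMasks getDemandedMasks(const std::vector<uint64_t> &MaskElts,
                               bool IsConstant) {
  // Non-constant mask: conservatively demand everything.
  DemandedMasks DM{~0ULL, (1ULL << MaskElts.size()) - 1};
  if (!IsConstant)
    return DM;
  DM.Bits = 0;
  DM.Elts = 0;
  for (size_t I = 0; I != MaskElts.size(); ++I)
    if (MaskElts[I] != 0) {
      DM.Bits |= MaskElts[I]; // these bits survive the AND
      DM.Elts |= 1ULL << I;   // this lane is live
    }
  return DM;
}

int main() {
  // v4i64 mask {0, -1, 0, 0xFFFFFFFF}: only lanes 1 and 3 are live.
  DemandedMasks DM =
      getDemandedMasks({0, ~0ULL, 0, 0xFFFFFFFFULL}, /*IsConstant=*/true);
  printf("DemandedBits=%016llx DemandedElts=%llx\n",
         (unsigned long long)DM.Bits, (unsigned long long)DM.Elts);
}

In the patched code the same masks feed both the in-place simplifiers and SimplifyMultipleUseDemandedBits, which is what exposes the test changes below.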
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/psubus.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 924454a465ae..d779a469c56a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47090,30 +47090,44 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// If either operand is a constant mask, then only the elements that aren't
// zero are actually demanded by the other operand.
- auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
+ auto GetDemandedMasks = [&](SDValue Op) {
APInt UndefElts;
SmallVector<APInt> EltBits;
int NumElts = VT.getVectorNumElements();
int EltSizeInBits = VT.getScalarSizeInBits();
- if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
- return false;
-
- APInt DemandedBits = APInt::getZero(EltSizeInBits);
- APInt DemandedElts = APInt::getZero(NumElts);
- for (int I = 0; I != NumElts; ++I)
- if (!EltBits[I].isZero()) {
- DemandedBits |= EltBits[I];
- DemandedElts.setBit(I);
- }
-
- return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI) ||
- TLI.SimplifyDemandedBits(OtherOp, DemandedBits, DemandedElts, DCI);
+ APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
+ APInt DemandedElts = APInt::getAllOnes(NumElts);
+ if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
+ EltBits)) {
+ DemandedBits.clearAllBits();
+ DemandedElts.clearAllBits();
+ for (int I = 0; I != NumElts; ++I)
+ if (!EltBits[I].isZero()) {
+ DemandedBits |= EltBits[I];
+ DemandedElts.setBit(I);
+ }
+ }
+ return std::make_pair(DemandedBits, DemandedElts);
};
- if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
+ std::pair<APInt, APInt> Demand0 = GetDemandedMasks(N1);
+ std::pair<APInt, APInt> Demand1 = GetDemandedMasks(N0);
+
+ if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) ||
+ TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) ||
+ TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) ||
+ TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
+
+ SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Demand0.first,
+ Demand0.second, DAG);
+ SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Demand1.first,
+ Demand1.second, DAG);
+ if (NewN0 || NewN1)
+ return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
+ NewN1 ? NewN1 : N1);
}
// Attempt to combine a scalar bitmask AND with an extracted shuffle.
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index f0cb154a3011..61bc41774008 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -2672,73 +2672,73 @@ define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) {
define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
; SSE2OR3-LABEL: test33:
; SSE2OR3: # %bb.0:
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
; SSE2OR3-NEXT: movdqa %xmm3, %xmm6
-; SSE2OR3-NEXT: pxor %xmm9, %xmm6
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259455,9223372039002259455]
-; SSE2OR3-NEXT: movdqa %xmm10, %xmm7
+; SSE2OR3-NEXT: pxor %xmm8, %xmm6
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
+; SSE2OR3-NEXT: movdqa %xmm9, %xmm7
; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm6
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2OR3-NEXT: pcmpeqd %xmm9, %xmm6
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2OR3-NEXT: pand %xmm8, %xmm6
+; SSE2OR3-NEXT: pand %xmm10, %xmm6
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE2OR3-NEXT: por %xmm6, %xmm7
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm10
; SSE2OR3-NEXT: pand %xmm7, %xmm3
-; SSE2OR3-NEXT: pandn %xmm8, %xmm7
+; SSE2OR3-NEXT: pxor %xmm10, %xmm7
; SSE2OR3-NEXT: por %xmm3, %xmm7
; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
-; SSE2OR3-NEXT: pxor %xmm9, %xmm3
-; SSE2OR3-NEXT: movdqa %xmm10, %xmm6
+; SSE2OR3-NEXT: pxor %xmm8, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm9, %xmm6
; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,0,2,2]
-; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm3
+; SSE2OR3-NEXT: pcmpeqd %xmm9, %xmm3
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2OR3-NEXT: pand %xmm11, %xmm3
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2OR3-NEXT: por %xmm3, %xmm6
; SSE2OR3-NEXT: pand %xmm6, %xmm2
-; SSE2OR3-NEXT: pandn %xmm8, %xmm6
+; SSE2OR3-NEXT: pxor %xmm10, %xmm6
; SSE2OR3-NEXT: por %xmm2, %xmm6
; SSE2OR3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
; SSE2OR3-NEXT: psubd %xmm6, %xmm2
-; SSE2OR3-NEXT: pxor %xmm9, %xmm6
-; SSE2OR3-NEXT: pxor %xmm9, %xmm0
+; SSE2OR3-NEXT: pxor %xmm8, %xmm6
+; SSE2OR3-NEXT: pxor %xmm8, %xmm0
; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm0
; SSE2OR3-NEXT: pand %xmm2, %xmm0
; SSE2OR3-NEXT: movdqa %xmm5, %xmm2
-; SSE2OR3-NEXT: pxor %xmm9, %xmm2
-; SSE2OR3-NEXT: movdqa %xmm10, %xmm3
+; SSE2OR3-NEXT: pxor %xmm8, %xmm2
+; SSE2OR3-NEXT: movdqa %xmm9, %xmm3
; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm2
+; SSE2OR3-NEXT: pcmpeqd %xmm9, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2OR3-NEXT: pand %xmm6, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2OR3-NEXT: por %xmm2, %xmm3
; SSE2OR3-NEXT: pand %xmm3, %xmm5
-; SSE2OR3-NEXT: pandn %xmm8, %xmm3
+; SSE2OR3-NEXT: pxor %xmm10, %xmm3
; SSE2OR3-NEXT: por %xmm5, %xmm3
; SSE2OR3-NEXT: movdqa %xmm4, %xmm2
-; SSE2OR3-NEXT: pxor %xmm9, %xmm2
-; SSE2OR3-NEXT: movdqa %xmm10, %xmm5
+; SSE2OR3-NEXT: pxor %xmm8, %xmm2
+; SSE2OR3-NEXT: movdqa %xmm9, %xmm5
; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm5
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm2
+; SSE2OR3-NEXT: pcmpeqd %xmm9, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2OR3-NEXT: pand %xmm6, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2OR3-NEXT: por %xmm2, %xmm5
-; SSE2OR3-NEXT: pand %xmm5, %xmm4
-; SSE2OR3-NEXT: pandn %xmm8, %xmm5
-; SSE2OR3-NEXT: por %xmm4, %xmm5
+; SSE2OR3-NEXT: pxor %xmm5, %xmm10
+; SSE2OR3-NEXT: pand %xmm4, %xmm5
+; SSE2OR3-NEXT: por %xmm10, %xmm5
; SSE2OR3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2]
; SSE2OR3-NEXT: movdqa %xmm1, %xmm2
; SSE2OR3-NEXT: psubd %xmm5, %xmm2
-; SSE2OR3-NEXT: pxor %xmm9, %xmm5
-; SSE2OR3-NEXT: pxor %xmm9, %xmm1
+; SSE2OR3-NEXT: pxor %xmm8, %xmm5
+; SSE2OR3-NEXT: pxor %xmm8, %xmm1
; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm1
; SSE2OR3-NEXT: pand %xmm2, %xmm1
; SSE2OR3-NEXT: retq
@@ -2904,73 +2904,73 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1]
; SSE2OR3-NEXT: pand %xmm6, %xmm1
; SSE2OR3-NEXT: pand %xmm6, %xmm0
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
; SSE2OR3-NEXT: movdqa %xmm3, %xmm6
-; SSE2OR3-NEXT: pxor %xmm9, %xmm6
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259455,9223372039002259455]
-; SSE2OR3-NEXT: movdqa %xmm10, %xmm7
+; SSE2OR3-NEXT: pxor %xmm8, %xmm6
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
+; SSE2OR3-NEXT: movdqa %xmm9, %xmm7
; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm6
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2OR3-NEXT: pcmpeqd %xmm9, %xmm6
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2OR3-NEXT: pand %xmm8, %xmm6
+; SSE2OR3-NEXT: pand %xmm10, %xmm6
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE2OR3-NEXT: por %xmm6, %xmm7
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm10
; SSE2OR3-NEXT: pand %xmm7, %xmm3
-; SSE2OR3-NEXT: pandn %xmm8, %xmm7
+; SSE2OR3-NEXT: pxor %xmm10, %xmm7
; SSE2OR3-NEXT: por %xmm3, %xmm7
; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
-; SSE2OR3-NEXT: pxor %xmm9, %xmm3
-; SSE2OR3-NEXT: movdqa %xmm10, %xmm6
+; SSE2OR3-NEXT: pxor %xmm8, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm9, %xmm6
; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,0,2,2]
-; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm3
+; SSE2OR3-NEXT: pcmpeqd %xmm9, %xmm3
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2OR3-NEXT: pand %xmm11, %xmm3
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2OR3-NEXT: por %xmm3, %xmm6
; SSE2OR3-NEXT: pand %xmm6, %xmm2
-; SSE2OR3-NEXT: pandn %xmm8, %xmm6
+; SSE2OR3-NEXT: pxor %xmm10, %xmm6
; SSE2OR3-NEXT: por %xmm2, %xmm6
; SSE2OR3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
; SSE2OR3-NEXT: psubd %xmm6, %xmm2
-; SSE2OR3-NEXT: pxor %xmm9, %xmm6
-; SSE2OR3-NEXT: por %xmm9, %xmm0
+; SSE2OR3-NEXT: pxor %xmm8, %xmm6
+; SSE2OR3-NEXT: por %xmm8, %xmm0
; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm0
; SSE2OR3-NEXT: pand %xmm2, %xmm0
; SSE2OR3-NEXT: movdqa %xmm5, %xmm2
-; SSE2OR3-NEXT: pxor %xmm9, %xmm2
-; SSE2OR3-NEXT: movdqa %xmm10, %xmm3
+; SSE2OR3-NEXT: pxor %xmm8, %xmm2
+; SSE2OR3-NEXT: movdqa %xmm9, %xmm3
; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm2
+; SSE2OR3-NEXT: pcmpeqd %xmm9, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2OR3-NEXT: pand %xmm6, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2OR3-NEXT: por %xmm2, %xmm3
; SSE2OR3-NEXT: pand %xmm3, %xmm5
-; SSE2OR3-NEXT: pandn %xmm8, %xmm3
+; SSE2OR3-NEXT: pxor %xmm10, %xmm3
; SSE2OR3-NEXT: por %xmm5, %xmm3
; SSE2OR3-NEXT: movdqa %xmm4, %xmm2
-; SSE2OR3-NEXT: pxor %xmm9, %xmm2
-; SSE2OR3-NEXT: movdqa %xmm10, %xmm5
+; SSE2OR3-NEXT: pxor %xmm8, %xmm2
+; SSE2OR3-NEXT: movdqa %xmm9, %xmm5
; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm5
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm2
+; SSE2OR3-NEXT: pcmpeqd %xmm9, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2OR3-NEXT: pand %xmm6, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2OR3-NEXT: por %xmm2, %xmm5
-; SSE2OR3-NEXT: pand %xmm5, %xmm4
-; SSE2OR3-NEXT: pandn %xmm8, %xmm5
-; SSE2OR3-NEXT: por %xmm4, %xmm5
+; SSE2OR3-NEXT: pxor %xmm5, %xmm10
+; SSE2OR3-NEXT: pand %xmm4, %xmm5
+; SSE2OR3-NEXT: por %xmm10, %xmm5
; SSE2OR3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2]
; SSE2OR3-NEXT: movdqa %xmm1, %xmm2
; SSE2OR3-NEXT: psubd %xmm5, %xmm2
-; SSE2OR3-NEXT: pxor %xmm9, %xmm5
-; SSE2OR3-NEXT: por %xmm9, %xmm1
+; SSE2OR3-NEXT: pxor %xmm8, %xmm5
+; SSE2OR3-NEXT: por %xmm8, %xmm1
; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm1
; SSE2OR3-NEXT: pand %xmm2, %xmm1
; SSE2OR3-NEXT: retq
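The test churn above is mostly register renaming, but the substantive change is the select-like sequence: before the patch each lane computed (mask & x) | (~mask & 0x00000000FFFFFFFF) via a pandn against a loaded [4294967295,4294967295] constant; afterwards the constant load disappears and the pandn becomes a pxor against an all-ones register materialized with pcmpeqd. That is legal here because the following shufps reads only the low 32 bits of each 64-bit element, which is exactly the demanded-bits fact the new SimplifyMultipleUseDemandedBits call exploits. A small sanity check of the lane-level equivalence under that assumption (a sketch, not part of the commit):

#include <cassert>
#include <cstdint>

// Old sequence: not(m) & 0x00000000FFFFFFFF per 64-bit lane (pandn against
// a loaded constant). New sequence: plain not(m) (pxor against all-ones
// from pcmpeqd). The subsequent shufps keeps only the low 32 bits of each
// lane, where the two agree, so the masking was never demanded.
int main() {
  for (uint64_t m : {0x0ULL, ~0x0ULL, 0x123456789abcdefULL,
                     0xFFFFFFFF00000000ULL}) {
    uint32_t oldLow = (uint32_t)(~m & 0x00000000FFFFFFFFULL);
    uint32_t newLow = (uint32_t)(~m);
    assert(oldLow == newLow);
  }
}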