[llvm] 358df06 - [X86] Improve `matchBinaryShuffle()`'s `BLEND` lowering with per-element all-zero/all-ones knowledge
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 17 09:15:15 PDT 2021
Author: Roman Lebedev
Date: 2021-09-17T19:13:33+03:00
New Revision: 358df06f4e8d64ba5ad3e80c8f0d763cbb8c0065
URL: https://github.com/llvm/llvm-project/commit/358df06f4e8d64ba5ad3e80c8f0d763cbb8c0065
DIFF: https://github.com/llvm/llvm-project/commit/358df06f4e8d64ba5ad3e80c8f0d763cbb8c0065.diff
LOG: [X86] Improve `matchBinaryShuffle()`'s `BLEND` lowering with per-element all-zero/all-ones knowledge
We can use `OR` instead of `BLEND` if, for each element, either the element we are not picking
is known to be zero (or is masked away), or the element we are picking overwhelms whatever the
element we are not picking could contribute (e.g. it is all-ones):
https://alive2.llvm.org/ce/z/RKejao
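To illustrate the per-element rule, here is a minimal scalar model in standalone C++ (this is
not code from the commit; `orIsSoundForLane` and the sample lane values are made up for the
example):

    #include <array>
    #include <cassert>
    #include <cstdint>

    // Hypothetical helper: the OR rewrite is sound for a lane when the element
    // BLEND would discard is known zero, or the element BLEND keeps is all-ones.
    static bool orIsSoundForLane(uint8_t Picked, uint8_t Other) {
      return Other == 0x00 || Picked == 0xFF;
    }

    int main() {
      // Lane-wise blend of two <4 x i8> vectors; Mask[i] picks V1 (0) or V2 (1).
      std::array<uint8_t, 4> V1{0xFF, 0x12, 0x00, 0xFF};
      std::array<uint8_t, 4> V2{0x34, 0x00, 0x56, 0x9A};
      std::array<int, 4> Mask{0, 0, 1, 1};

      for (int i = 0; i != 4; ++i) {
        uint8_t Picked = Mask[i] == 0 ? V1[i] : V2[i]; // element BLEND keeps
        uint8_t Other  = Mask[i] == 0 ? V2[i] : V1[i]; // element BLEND discards
        uint8_t Or = V1[i] | V2[i];
        // Whenever the side condition holds, OR reproduces the blended element.
        if (orIsSoundForLane(Picked, Other))
          assert(Or == Picked && "OR must reproduce the blended element");
      }
      return 0;
    }

Lane 3 shows why the check is needed: there the discarded element is all-ones and the picked
element is not, so `OR` would yield 0xFF instead of 0x9A and the rewrite is skipped.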
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D109726
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/insertelement-ones.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ed153256ce29..09ba7af6e38a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36226,12 +36226,58 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
IsBlend = false;
break;
}
- if (IsBlend &&
- DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
- DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
- Shuffle = ISD::OR;
- SrcVT = DstVT = MaskVT.changeTypeToInteger();
- return true;
+ if (IsBlend) {
+ if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
+ DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
+ Shuffle = ISD::OR;
+ SrcVT = DstVT = MaskVT.changeTypeToInteger();
+ return true;
+ }
+ if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
+ // FIXME: handle mismatched sizes?
+ // TODO: investigate if `ISD::OR` handling in
+ // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
+ auto computeKnownBitsElementWise = [&DAG](SDValue V) {
+ unsigned NumElts = V.getValueType().getVectorNumElements();
+ KnownBits Known(NumElts);
+ for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
+ APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
+ KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
+ if (PeepholeKnown.isZero())
+ Known.Zero.setBit(EltIdx);
+ if (PeepholeKnown.isAllOnes())
+ Known.One.setBit(EltIdx);
+ }
+ return Known;
+ };
+
+ KnownBits V1Known = computeKnownBitsElementWise(V1);
+ KnownBits V2Known = computeKnownBitsElementWise(V2);
+
+ for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ if (M == SM_SentinelZero) {
+ IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
+ continue;
+ }
+ if (M == (int)i) {
+ IsBlend &= V2Known.Zero[i] || V1Known.One[i];
+ continue;
+ }
+ if (M == (int)(i + NumMaskElts)) {
+ IsBlend &= V1Known.Zero[i] || V2Known.One[i];
+ continue;
+ }
+ llvm_unreachable("will not get here.");
+ }
+ if (IsBlend) {
+ Shuffle = ISD::OR;
+ SrcVT = DstVT = MaskVT.changeTypeToInteger();
+ return true;
+ }
+ }
}
}
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
index f8eca3608d81..04c02f04f816 100644
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -280,11 +280,8 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) {
;
; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX1: # %bb.0:
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -315,26 +312,22 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
; SSE2-LABEL: insert_v16i8_x123456789ABCDEx:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movl $255, %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v16i8_x123456789ABCDEx:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: movl $255, %eax
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: pandn %xmm2, %xmm1
-; SSE3-NEXT: por %xmm1, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE3-NEXT: por %xmm1, %xmm2
; SSE3-NEXT: por %xmm2, %xmm0
; SSE3-NEXT: retq
;
@@ -344,7 +337,7 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
+; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
@@ -372,41 +365,31 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movl $255, %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT: pand %xmm2, %xmm0
; SSE3-NEXT: movl $255, %eax
; SSE3-NEXT: movd %eax, %xmm3
; SSE3-NEXT: pandn %xmm3, %xmm2
-; SSE3-NEXT: por %xmm2, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; SSE3-NEXT: pand %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm3, %xmm4
; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE3-NEXT: por %xmm4, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: por %xmm4, %xmm2
+; SSE3-NEXT: por %xmm2, %xmm0
; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE3-NEXT: por %xmm4, %xmm3
; SSE3-NEXT: por %xmm3, %xmm1
-; SSE3-NEXT: pand %xmm2, %xmm1
-; SSE3-NEXT: por %xmm4, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
@@ -415,15 +398,13 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
+; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[u]
-; SSSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero
+; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
-; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;