[llvm] ae33cbc - [X86][SSE] LowerVectorAllZeroTest - add support for >256-bit vectors
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 15 07:30:36 PDT 2020
Author: Simon Pilgrim
Date: 2020-06-15T15:30:24+01:00
New Revision: ae33cbc49408c90cef0b1246a7bae59bd467c93b
URL: https://github.com/llvm/llvm-project/commit/ae33cbc49408c90cef0b1246a7bae59bd467c93b
DIFF: https://github.com/llvm/llvm-project/commit/ae33cbc49408c90cef0b1246a7bae59bd467c93b.diff
LOG: [X86][SSE] LowerVectorAllZeroTest - add support for >256-bit vectors
Reduce by repeatedly splitting the vector in half and ORing the two halves together until we reach the target size for PTEST/MOVMSK_PCMPEQ (256 bits with AVX, otherwise 128 bits). There might be some cases where AVX512 can perform this test directly on 512-bit vectors, but so far I haven't encountered any such pattern that reaches LowerVectorAllZeroTest.
Prep work for D81547
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/ptest.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 83bc40ace2ab..b80c94661d74 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21366,9 +21366,9 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
}) &&
"Reduction source vector mismatch");
- // Quit if not 128/256-bit vector.
+ // Quit if less than 128-bits or not splittable to 128/256-bit vector.
EVT VT = VecIns[0].getValueType();
- if (!VT.is128BitVector() && !VT.is256BitVector())
+ if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
return SDValue();
SDLoc DL(Op);
@@ -21382,18 +21382,28 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
}
+ SDValue V = VecIns.back();
+
+ // Split down to 128/256-bit vector.
+ unsigned TestSize = Subtarget.hasAVX()? 256 : 128;
+ while (VT.getSizeInBits() > TestSize) {
+ auto Split = DAG.SplitVector(V, DL);
+ VT = Split.first.getValueType();
+ V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
+ }
+
X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
DL, MVT::i8);
bool UsePTEST = Subtarget.hasSSE41();
if (UsePTEST) {
MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
- SDValue V = DAG.getBitcast(TestVT, VecIns.back());
+ V = DAG.getBitcast(TestVT, V);
return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
}
SDValue Result = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8,
- DAG.getBitcast(MVT::v16i8, VecIns.back()),
+ DAG.getBitcast(MVT::v16i8, V),
getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll
index 13ca7195bca3..a93a03fea670 100644
--- a/llvm/test/CodeGen/X86/ptest.ll
+++ b/llvm/test/CodeGen/X86/ptest.ll
@@ -148,24 +148,9 @@ define i32 @veccond512(<16 x i32> %input) {
;
; AVX512-LABEL: veccond512:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %rcx
-; AVX512-NEXT: orq %rax, %rcx
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512-NEXT: vmovq %xmm3, %rax
-; AVX512-NEXT: orq %rcx, %rax
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: orq %rax, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512-NEXT: orq %rax, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512-NEXT: orq %rdx, %rax
-; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512-NEXT: orq %rax, %rdx
-; AVX512-NEXT: orq %rcx, %rdx
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vptest %ymm0, %ymm0
; AVX512-NEXT: je .LBB2_2
; AVX512-NEXT: # %bb.1: # %if-true-block
; AVX512-NEXT: xorl %eax, %eax
@@ -283,25 +268,10 @@ define i32 @vectest512(<16 x i32> %input) {
;
; AVX512-LABEL: vectest512:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %rcx
-; AVX512-NEXT: orq %rax, %rcx
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512-NEXT: vmovq %xmm3, %rax
-; AVX512-NEXT: orq %rcx, %rax
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: orq %rax, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512-NEXT: orq %rax, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512-NEXT: orq %rdx, %rax
-; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512-NEXT: orq %rax, %rdx
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: orq %rcx, %rdx
+; AVX512-NEXT: vptest %ymm0, %ymm0
; AVX512-NEXT: setne %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -410,24 +380,9 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
; AVX512-LABEL: vecsel512:
; AVX512: # %bb.0:
; AVX512-NEXT: movl %edi, %eax
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rcx
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %rdx
-; AVX512-NEXT: orq %rcx, %rdx
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512-NEXT: vmovq %xmm3, %rcx
-; AVX512-NEXT: orq %rdx, %rcx
-; AVX512-NEXT: vmovq %xmm0, %rdx
-; AVX512-NEXT: orq %rcx, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm2, %rdi
-; AVX512-NEXT: orq %rcx, %rdi
-; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
-; AVX512-NEXT: orq %rdi, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm0, %rdi
-; AVX512-NEXT: orq %rcx, %rdi
-; AVX512-NEXT: orq %rdx, %rdi
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vptest %ymm0, %ymm0
; AVX512-NEXT: cmovel %esi, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
More information about the llvm-commits mailing list