[llvm] c7ba5a9 - [X86][SSE] Add initial support for extracting non-constant bool vector elements
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 19 06:35:42 PDT 2022
Author: Simon Pilgrim
Date: 2022-03-19T13:31:05Z
New Revision: c7ba5a9affbcabb0d05301e5417c203274667572
URL: https://github.com/llvm/llvm-project/commit/c7ba5a9affbcabb0d05301e5417c203274667572
DIFF: https://github.com/llvm/llvm-project/commit/c7ba5a9affbcabb0d05301e5417c203274667572.diff
LOG: [X86][SSE] Add initial support for extracting non-constant bool vector elements
We can use MOVMSK+TEST/BT to extract individual bool elements even if the index isn't constant.
This relies on combineBitcastvxi1, so some AVX512 cases still aren't optimized because they avoid MOVMSK usage.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-insert-extract.ll
llvm/test/CodeGen/X86/movmsk-cmp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f3f78a624d69d..0ba6fa26c7437 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43333,29 +43333,32 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
// but not
// i1 = extract_vector_elt t0:1, Constant:i64<2>
// since the latter would need its own MOVMSK.
- if (CIdx && SrcVT.getScalarType() == MVT::i1) {
+ if (SrcVT.getScalarType() == MVT::i1) {
+ bool IsVar = !CIdx;
SmallVector<SDNode *, 16> BoolExtracts;
unsigned ResNo = InputVector.getResNo();
- auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
+ auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- isa<ConstantSDNode>(Use->getOperand(1)) &&
Use->getOperand(0).getResNo() == ResNo &&
Use->getValueType(0) == MVT::i1) {
BoolExtracts.push_back(Use);
+ IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
return true;
}
return false;
};
+ // TODO: Can we drop the oneuse check for constant extracts?
if (all_of(InputVector->uses(), IsBoolExtract) &&
- BoolExtracts.size() > 1) {
+ (IsVar || BoolExtracts.size() > 1)) {
EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
if (SDValue BC =
combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
for (SDNode *Use : BoolExtracts) {
// extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
- unsigned MaskIdx = Use->getConstantOperandVal(1);
- APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
- SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
+ // Mask = 1 << MaskIdx
+ SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
+ SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
+ SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
DCI.CombineTo(Use, Res);
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 78bb6d5f1a635..5e0318b1984e4 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1633,23 +1633,13 @@ define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %
define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v32i1:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa %ymm0, (%rsp)
-; KNL-NEXT: andl $31, %edi
-; KNL-NEXT: movzbl (%rsp,%rdi), %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
+; KNL-NEXT: vpmovmskb %ymm0, %ecx
+; KNL-NEXT: notl %ecx
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: btl %edi, %ecx
+; KNL-NEXT: setb %al
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 955266a782c40..70a086e96e6e4 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -4264,33 +4264,26 @@ define i1 @movmsk_or_v2f64(<2 x double> %x, <2 x double> %y) {
define i1 @movmsk_v16i8_var(<16 x i8> %x, <16 x i8> %y, i32 %z) {
; SSE-LABEL: movmsk_v16i8_var:
; SSE: # %bb.0:
-; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $15, %edi
-; SSE-NEXT: movb -24(%rsp,%rdi), %al
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: btl %edi, %eax
+; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v16i8_var:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT: andl $15, %edi
-; AVX1OR2-NEXT: movb -24(%rsp,%rdi), %al
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT: btl %edi, %eax
+; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v16i8_var:
; KNL: # %bb.0:
-; KNL-NEXT: # kill: def $edi killed $edi def $rdi
; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovdb %zmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: movb -24(%rsp,%rdi), %al
-; KNL-NEXT: vzeroupper
+; KNL-NEXT: vpmovmskb %xmm0, %eax
+; KNL-NEXT: btl %edi, %eax
+; KNL-NEXT: setb %al
; KNL-NEXT: retq
;
; SKX-LABEL: movmsk_v16i8_var:
@@ -4310,20 +4303,20 @@ define i1 @movmsk_v16i8_var(<16 x i8> %x, <16 x i8> %y, i32 %z) {
define i1 @movmsk_v8i16_var(<8 x i16> %x, <8 x i16> %y, i32 %z) {
; SSE-LABEL: movmsk_v8i16_var:
; SSE: # %bb.0:
-; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $7, %edi
-; SSE-NEXT: movb -24(%rsp,%rdi,2), %al
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: btl %edi, %eax
+; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v8i16_var:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT: andl $7, %edi
-; AVX1OR2-NEXT: movb -24(%rsp,%rdi,2), %al
+; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT: btl %edi, %eax
+; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v8i16_var:
@@ -4357,20 +4350,18 @@ define i1 @movmsk_v8i16_var(<8 x i16> %x, <8 x i16> %y, i32 %z) {
define i1 @movmsk_v4i32_var(<4 x i32> %x, <4 x i32> %y, i32 %z) {
; SSE-LABEL: movmsk_v4i32_var:
; SSE: # %bb.0:
-; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $3, %edi
-; SSE-NEXT: movb -24(%rsp,%rdi,4), %al
+; SSE-NEXT: movmskps %xmm1, %eax
+; SSE-NEXT: btl %edi, %eax
+; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v4i32_var:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT: andl $3, %edi
-; AVX1OR2-NEXT: movb -24(%rsp,%rdi,4), %al
+; AVX1OR2-NEXT: vmovmskps %xmm0, %eax
+; AVX1OR2-NEXT: btl %edi, %eax
+; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v4i32_var:
@@ -4403,37 +4394,31 @@ define i1 @movmsk_v4i32_var(<4 x i32> %x, <4 x i32> %y, i32 %z) {
define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
; SSE2-LABEL: movmsk_v2i64_var:
; SSE2: # %bb.0:
-; SSE2-NEXT: # kill: def $edi killed $edi def $rdi
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: andl $1, %edi
-; SSE2-NEXT: movb -24(%rsp,%rdi,8), %al
+; SSE2-NEXT: movmskpd %xmm1, %eax
+; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: btl %edi, %eax
+; SSE2-NEXT: setb %al
; SSE2-NEXT: retq
;
; SSE41-LABEL: movmsk_v2i64_var:
; SSE41: # %bb.0:
-; SSE41-NEXT: # kill: def $edi killed $edi def $rdi
; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: andl $1, %edi
-; SSE41-NEXT: movb -24(%rsp,%rdi,8), %al
+; SSE41-NEXT: movmskpd %xmm0, %eax
+; SSE41-NEXT: xorl $3, %eax
+; SSE41-NEXT: btl %edi, %eax
+; SSE41-NEXT: setb %al
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v2i64_var:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT: andl $1, %edi
-; AVX1OR2-NEXT: movb -24(%rsp,%rdi,8), %al
+; AVX1OR2-NEXT: vmovmskpd %xmm0, %eax
+; AVX1OR2-NEXT: xorl $3, %eax
+; AVX1OR2-NEXT: btl %edi, %eax
+; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v2i64_var:
@@ -4466,23 +4451,21 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
define i1 @movmsk_v4f32_var(<4 x float> %x, <4 x float> %y, i32 %z) {
; SSE-LABEL: movmsk_v4f32_var:
; SSE: # %bb.0:
-; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: cmpeqps %xmm1, %xmm2
; SSE-NEXT: cmpunordps %xmm1, %xmm0
; SSE-NEXT: orps %xmm2, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $3, %edi
-; SSE-NEXT: movb -24(%rsp,%rdi,4), %al
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: btl %edi, %eax
+; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v4f32_var:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vcmpeq_uqps %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT: andl $3, %edi
-; AVX1OR2-NEXT: movb -24(%rsp,%rdi,4), %al
+; AVX1OR2-NEXT: vmovmskps %xmm0, %eax
+; AVX1OR2-NEXT: btl %edi, %eax
+; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v4f32_var:
@@ -4515,20 +4498,18 @@ define i1 @movmsk_v4f32_var(<4 x float> %x, <4 x float> %y, i32 %z) {
define i1 @movmsk_v2f64_var(<2 x double> %x, <2 x double> %y, i32 %z) {
; SSE-LABEL: movmsk_v2f64_var:
; SSE: # %bb.0:
-; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: cmplepd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $1, %edi
-; SSE-NEXT: movb -24(%rsp,%rdi,8), %al
+; SSE-NEXT: movmskpd %xmm1, %eax
+; SSE-NEXT: btl %edi, %eax
+; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v2f64_var:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vcmplepd %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT: vmovapd %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT: andl $1, %edi
-; AVX1OR2-NEXT: movb -24(%rsp,%rdi,8), %al
+; AVX1OR2-NEXT: vmovmskpd %xmm0, %eax
+; AVX1OR2-NEXT: btl %edi, %eax
+; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v2f64_var:
More information about the llvm-commits
mailing list