[llvm] c7ba5a9 - [X86][SSE] Add initial support for extracting non-constant bool vector elements

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 19 06:35:42 PDT 2022


Author: Simon Pilgrim
Date: 2022-03-19T13:31:05Z
New Revision: c7ba5a9affbcabb0d05301e5417c203274667572

URL: https://github.com/llvm/llvm-project/commit/c7ba5a9affbcabb0d05301e5417c203274667572
DIFF: https://github.com/llvm/llvm-project/commit/c7ba5a9affbcabb0d05301e5417c203274667572.diff

LOG: [X86][SSE] Add initial support for extracting non-constant bool vector elements

We can use MOVMSK+TEST/BT to extract individual bool elements even if the index isn't constant.

This relies on combineBitcastvxi1, so some AVX512 cases still aren't optimized because they avoid MOVMSK usage.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/test/CodeGen/X86/movmsk-cmp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f3f78a624d69d..0ba6fa26c7437 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43333,29 +43333,32 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   // but not
   //   i1 = extract_vector_elt t0:1, Constant:i64<2>
   // since the latter would need its own MOVMSK.
-  if (CIdx && SrcVT.getScalarType() == MVT::i1) {
+  if (SrcVT.getScalarType() == MVT::i1) {
+    bool IsVar = !CIdx;
     SmallVector<SDNode *, 16> BoolExtracts;
     unsigned ResNo = InputVector.getResNo();
-    auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
+    auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
       if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-          isa<ConstantSDNode>(Use->getOperand(1)) &&
           Use->getOperand(0).getResNo() == ResNo &&
           Use->getValueType(0) == MVT::i1) {
         BoolExtracts.push_back(Use);
+        IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
         return true;
       }
       return false;
     };
+    // TODO: Can we drop the oneuse check for constant extracts?
     if (all_of(InputVector->uses(), IsBoolExtract) &&
-        BoolExtracts.size() > 1) {
+        (IsVar || BoolExtracts.size() > 1)) {
       EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
       if (SDValue BC =
               combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
         for (SDNode *Use : BoolExtracts) {
           // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
-          unsigned MaskIdx = Use->getConstantOperandVal(1);
-          APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
-          SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
+          // Mask = 1 << MaskIdx
+          SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
+          SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
+          SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
           SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
           Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
           DCI.CombineTo(Use, Res);

diff  --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 78bb6d5f1a635..5e0318b1984e4 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1633,23 +1633,13 @@ define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %
 define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
 ; KNL-LABEL: test_extractelement_varible_v32i1:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:    .cfi_def_cfa_offset 16
-; KNL-NEXT:    .cfi_offset %rbp, -16
-; KNL-NEXT:    movq %rsp, %rbp
-; KNL-NEXT:    .cfi_def_cfa_register %rbp
-; KNL-NEXT:    andq $-32, %rsp
-; KNL-NEXT:    subq $64, %rsp
-; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
 ; KNL-NEXT:    vpminub %ymm1, %ymm0, %ymm1
 ; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
-; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
-; KNL-NEXT:    andl $31, %edi
-; KNL-NEXT:    movzbl (%rsp,%rdi), %eax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    movq %rbp, %rsp
-; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vpmovmskb %ymm0, %ecx
+; KNL-NEXT:    notl %ecx
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    btl %edi, %ecx
+; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 955266a782c40..70a086e96e6e4 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -4264,33 +4264,26 @@ define i1 @movmsk_or_v2f64(<2 x double> %x, <2 x double> %y) {
 define i1 @movmsk_v16i8_var(<16 x i8> %x, <16 x i8> %y, i32 %z) {
 ; SSE-LABEL: movmsk_v16i8_var:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
 ; SSE-NEXT:    pcmpeqb %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    andl $15, %edi
-; SSE-NEXT:    movb -24(%rsp,%rdi), %al
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    btl %edi, %eax
+; SSE-NEXT:    setb %al
 ; SSE-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: movmsk_v16i8_var:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; AVX1OR2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT:    andl $15, %edi
-; AVX1OR2-NEXT:    movb -24(%rsp,%rdi), %al
+; AVX1OR2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT:    btl %edi, %eax
+; AVX1OR2-NEXT:    setb %al
 ; AVX1OR2-NEXT:    retq
 ;
 ; KNL-LABEL: movmsk_v16i8_var:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    # kill: def $edi killed $edi def $rdi
 ; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    vpmovdb %zmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT:    andl $15, %edi
-; KNL-NEXT:    movb -24(%rsp,%rdi), %al
-; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    vpmovmskb %xmm0, %eax
+; KNL-NEXT:    btl %edi, %eax
+; KNL-NEXT:    setb %al
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: movmsk_v16i8_var:
@@ -4310,20 +4303,20 @@ define i1 @movmsk_v16i8_var(<16 x i8> %x, <16 x i8> %y, i32 %z) {
 define i1 @movmsk_v8i16_var(<8 x i16> %x, <8 x i16> %y, i32 %z) {
 ; SSE-LABEL: movmsk_v8i16_var:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
 ; SSE-NEXT:    pcmpgtw %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    andl $7, %edi
-; SSE-NEXT:    movb -24(%rsp,%rdi,2), %al
+; SSE-NEXT:    packsswb %xmm0, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    btl %edi, %eax
+; SSE-NEXT:    setb %al
 ; SSE-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: movmsk_v8i16_var:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; AVX1OR2-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT:    andl $7, %edi
-; AVX1OR2-NEXT:    movb -24(%rsp,%rdi,2), %al
+; AVX1OR2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX1OR2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT:    btl %edi, %eax
+; AVX1OR2-NEXT:    setb %al
 ; AVX1OR2-NEXT:    retq
 ;
 ; KNL-LABEL: movmsk_v8i16_var:
@@ -4357,20 +4350,18 @@ define i1 @movmsk_v8i16_var(<8 x i16> %x, <8 x i16> %y, i32 %z) {
 define i1 @movmsk_v4i32_var(<4 x i32> %x, <4 x i32> %y, i32 %z) {
 ; SSE-LABEL: movmsk_v4i32_var:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
 ; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    andl $3, %edi
-; SSE-NEXT:    movb -24(%rsp,%rdi,4), %al
+; SSE-NEXT:    movmskps %xmm1, %eax
+; SSE-NEXT:    btl %edi, %eax
+; SSE-NEXT:    setb %al
 ; SSE-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: movmsk_v4i32_var:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; AVX1OR2-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT:    andl $3, %edi
-; AVX1OR2-NEXT:    movb -24(%rsp,%rdi,4), %al
+; AVX1OR2-NEXT:    vmovmskps %xmm0, %eax
+; AVX1OR2-NEXT:    btl %edi, %eax
+; AVX1OR2-NEXT:    setb %al
 ; AVX1OR2-NEXT:    retq
 ;
 ; KNL-LABEL: movmsk_v4i32_var:
@@ -4403,37 +4394,31 @@ define i1 @movmsk_v4i32_var(<4 x i32> %x, <4 x i32> %y, i32 %z) {
 define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
 ; SSE2-LABEL: movmsk_v2i64_var:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    andl $1, %edi
-; SSE2-NEXT:    movb -24(%rsp,%rdi,8), %al
+; SSE2-NEXT:    movmskpd %xmm1, %eax
+; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    btl %edi, %eax
+; SSE2-NEXT:    setb %al
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: movmsk_v2i64_var:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
 ; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT:    andl $1, %edi
-; SSE41-NEXT:    movb -24(%rsp,%rdi,8), %al
+; SSE41-NEXT:    movmskpd %xmm0, %eax
+; SSE41-NEXT:    xorl $3, %eax
+; SSE41-NEXT:    btl %edi, %eax
+; SSE41-NEXT:    setb %al
 ; SSE41-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: movmsk_v2i64_var:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; AVX1OR2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT:    andl $1, %edi
-; AVX1OR2-NEXT:    movb -24(%rsp,%rdi,8), %al
+; AVX1OR2-NEXT:    vmovmskpd %xmm0, %eax
+; AVX1OR2-NEXT:    xorl $3, %eax
+; AVX1OR2-NEXT:    btl %edi, %eax
+; AVX1OR2-NEXT:    setb %al
 ; AVX1OR2-NEXT:    retq
 ;
 ; KNL-LABEL: movmsk_v2i64_var:
@@ -4466,23 +4451,21 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
 define i1 @movmsk_v4f32_var(<4 x float> %x, <4 x float> %y, i32 %z) {
 ; SSE-LABEL: movmsk_v4f32_var:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
 ; SSE-NEXT:    movaps %xmm0, %xmm2
 ; SSE-NEXT:    cmpeqps %xmm1, %xmm2
 ; SSE-NEXT:    cmpunordps %xmm1, %xmm0
 ; SSE-NEXT:    orps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    andl $3, %edi
-; SSE-NEXT:    movb -24(%rsp,%rdi,4), %al
+; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    btl %edi, %eax
+; SSE-NEXT:    setb %al
 ; SSE-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: movmsk_v4f32_var:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; AVX1OR2-NEXT:    vcmpeq_uqps %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT:    andl $3, %edi
-; AVX1OR2-NEXT:    movb -24(%rsp,%rdi,4), %al
+; AVX1OR2-NEXT:    vmovmskps %xmm0, %eax
+; AVX1OR2-NEXT:    btl %edi, %eax
+; AVX1OR2-NEXT:    setb %al
 ; AVX1OR2-NEXT:    retq
 ;
 ; KNL-LABEL: movmsk_v4f32_var:
@@ -4515,20 +4498,18 @@ define i1 @movmsk_v4f32_var(<4 x float> %x, <4 x float> %y, i32 %z) {
 define i1 @movmsk_v2f64_var(<2 x double> %x, <2 x double> %y, i32 %z) {
 ; SSE-LABEL: movmsk_v2f64_var:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
 ; SSE-NEXT:    cmplepd %xmm0, %xmm1
-; SSE-NEXT:    movapd %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    andl $1, %edi
-; SSE-NEXT:    movb -24(%rsp,%rdi,8), %al
+; SSE-NEXT:    movmskpd %xmm1, %eax
+; SSE-NEXT:    btl %edi, %eax
+; SSE-NEXT:    setb %al
 ; SSE-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: movmsk_v2f64_var:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; AVX1OR2-NEXT:    vcmplepd %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT:    vmovapd %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT:    andl $1, %edi
-; AVX1OR2-NEXT:    movb -24(%rsp,%rdi,8), %al
+; AVX1OR2-NEXT:    vmovmskpd %xmm0, %eax
+; AVX1OR2-NEXT:    btl %edi, %eax
+; AVX1OR2-NEXT:    setb %al
 ; AVX1OR2-NEXT:    retq
 ;
 ; KNL-LABEL: movmsk_v2f64_var:


        


More information about the llvm-commits mailing list