[llvm] [X86] Use vectorized i256 bit counts when we know the source originated from the vector unit (PR #171589)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 10 02:14:10 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

<details>
<summary>Changes</summary>

Currently we only permit i256 CTTZ/CTLZ AVX512 lowering when the source is loadable, as GPR->FPU transition costs would otherwise outweigh the vectorization benefit.

This patch also handles other cases where the source can avoid going through the GPRs: a new mayFoldToVector helper accepts a source that is a bitcast from a vector type or a constant value, in addition to the original mayFoldLoad check.

There will be other cases for the mayFoldToVector helper, but I've just used this for CTTZ/CTLZ initially.

---
Full diff: https://github.com/llvm/llvm-project/pull/171589.diff


2 Files Affected:

- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+10-1) 
- (modified) llvm/test/CodeGen/X86/bitcnt-big-integer.ll (+104-228) 


``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fbd875a93fd4a..b4ad7465d612e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2846,6 +2846,15 @@ bool X86::mayFoldIntoZeroExtend(SDValue Op) {
   return false;
 }
 
+// Return true if it's cheap to bitcast this to a vector type.
+static bool mayFoldToVector(SDValue Op, const X86Subtarget &Subtarget) {
+  if (peekThroughBitcasts(Op).getValueType().isVector())
+    return true;
+  if (isa<ConstantSDNode>(Op) || isa<ConstantFPSDNode>(Op))
+    return true;
+  return X86::mayFoldLoad(Op, Subtarget);
+}
+
 static bool isLogicOp(unsigned Opcode) {
   // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
   return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
@@ -33958,7 +33967,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     EVT VT = N->getValueType(0);
     assert(Subtarget.hasCDI() && "AVX512CD required");
     assert((VT == MVT::i256 || VT == MVT::i512) && "Unexpected VT!");
-    if (VT == MVT::i256 && !X86::mayFoldLoad(N0, Subtarget))
+    if (VT == MVT::i256 && !mayFoldToVector(N0, Subtarget))
       return;
 
     unsigned SizeInBits = VT.getSizeInBits();
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index 749b3ddc96d0d..06ccbf4daa1e8 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -1567,72 +1567,38 @@ define i32 @vector_ctlz_i256(<8 x i32> %v0) nounwind {
 ;
 ; AVX512F-LABEL: vector_ctlz_i256:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, %rdx
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512F-NEXT:    lzcntq %rsi, %rdi
-; AVX512F-NEXT:    lzcntq %rdx, %r8
-; AVX512F-NEXT:    addl $64, %r8d
-; AVX512F-NEXT:    testq %rsi, %rsi
-; AVX512F-NEXT:    cmovnel %edi, %r8d
-; AVX512F-NEXT:    lzcntq %rcx, %rdi
-; AVX512F-NEXT:    lzcntq %rax, %rax
-; AVX512F-NEXT:    addl $64, %eax
-; AVX512F-NEXT:    testq %rcx, %rcx
-; AVX512F-NEXT:    cmovnel %edi, %eax
-; AVX512F-NEXT:    subl $-128, %eax
-; AVX512F-NEXT:    orq %rsi, %rdx
-; AVX512F-NEXT:    cmovnel %r8d, %eax
-; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [256,256,256,256]
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512F-NEXT:    vplzcntq %zmm0, %zmm2
+; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovd %xmm1, %eax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vector_ctlz_i256:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, %rdx
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512VL-NEXT:    lzcntq %rsi, %rdi
-; AVX512VL-NEXT:    lzcntq %rdx, %r8
-; AVX512VL-NEXT:    addl $64, %r8d
-; AVX512VL-NEXT:    testq %rsi, %rsi
-; AVX512VL-NEXT:    cmovnel %edi, %r8d
-; AVX512VL-NEXT:    lzcntq %rcx, %rdi
-; AVX512VL-NEXT:    lzcntq %rax, %rax
-; AVX512VL-NEXT:    addl $64, %eax
-; AVX512VL-NEXT:    testq %rcx, %rcx
-; AVX512VL-NEXT:    cmovnel %edi, %eax
-; AVX512VL-NEXT:    subl $-128, %eax
-; AVX512VL-NEXT:    orq %rsi, %rdx
-; AVX512VL-NEXT:    cmovnel %r8d, %eax
-; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512VL-NEXT:    vplzcntq %ymm0, %ymm1
+; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT:    vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512VL-NEXT:    vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512POPCNT-LABEL: vector_ctlz_i256:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512POPCNT-NEXT:    vmovq %xmm0, %rax
-; AVX512POPCNT-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512POPCNT-NEXT:    vmovq %xmm0, %rdx
-; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512POPCNT-NEXT:    lzcntq %rsi, %rdi
-; AVX512POPCNT-NEXT:    lzcntq %rdx, %r8
-; AVX512POPCNT-NEXT:    addl $64, %r8d
-; AVX512POPCNT-NEXT:    testq %rsi, %rsi
-; AVX512POPCNT-NEXT:    cmovnel %edi, %r8d
-; AVX512POPCNT-NEXT:    lzcntq %rcx, %rdi
-; AVX512POPCNT-NEXT:    lzcntq %rax, %rax
-; AVX512POPCNT-NEXT:    addl $64, %eax
-; AVX512POPCNT-NEXT:    testq %rcx, %rcx
-; AVX512POPCNT-NEXT:    cmovnel %edi, %eax
-; AVX512POPCNT-NEXT:    subl $-128, %eax
-; AVX512POPCNT-NEXT:    orq %rsi, %rdx
-; AVX512POPCNT-NEXT:    cmovnel %r8d, %eax
-; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512POPCNT-NEXT:    vplzcntq %ymm0, %ymm1
+; AVX512POPCNT-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT:    vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512POPCNT-NEXT:    vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
 ; AVX512POPCNT-NEXT:    vzeroupper
 ; AVX512POPCNT-NEXT:    retq
   %a0 = bitcast <8 x i32> %v0 to i256
@@ -3246,72 +3212,35 @@ define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind {
 ;
 ; AVX512F-LABEL: vector_ctlz_undef_i256:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, %rdx
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512F-NEXT:    lzcntq %rsi, %rdi
-; AVX512F-NEXT:    lzcntq %rdx, %r8
-; AVX512F-NEXT:    addl $64, %r8d
-; AVX512F-NEXT:    testq %rsi, %rsi
-; AVX512F-NEXT:    cmovnel %edi, %r8d
-; AVX512F-NEXT:    lzcntq %rcx, %rdi
-; AVX512F-NEXT:    lzcntq %rax, %rax
-; AVX512F-NEXT:    addl $64, %eax
-; AVX512F-NEXT:    testq %rcx, %rcx
-; AVX512F-NEXT:    cmovnel %edi, %eax
-; AVX512F-NEXT:    subl $-128, %eax
-; AVX512F-NEXT:    orq %rsi, %rdx
-; AVX512F-NEXT:    cmovnel %r8d, %eax
-; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512F-NEXT:    vplzcntq %zmm0, %zmm1
+; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm0, %eax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vector_ctlz_undef_i256:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, %rdx
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512VL-NEXT:    lzcntq %rsi, %rdi
-; AVX512VL-NEXT:    lzcntq %rdx, %r8
-; AVX512VL-NEXT:    addl $64, %r8d
-; AVX512VL-NEXT:    testq %rsi, %rsi
-; AVX512VL-NEXT:    cmovnel %edi, %r8d
-; AVX512VL-NEXT:    lzcntq %rcx, %rdi
-; AVX512VL-NEXT:    lzcntq %rax, %rax
-; AVX512VL-NEXT:    addl $64, %eax
-; AVX512VL-NEXT:    testq %rcx, %rcx
-; AVX512VL-NEXT:    cmovnel %edi, %eax
-; AVX512VL-NEXT:    subl $-128, %eax
-; AVX512VL-NEXT:    orq %rsi, %rdx
-; AVX512VL-NEXT:    cmovnel %r8d, %eax
-; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512VL-NEXT:    vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vplzcntq %ymm0, %ymm0
+; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpcompressq %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512POPCNT-LABEL: vector_ctlz_undef_i256:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512POPCNT-NEXT:    vmovq %xmm0, %rax
-; AVX512POPCNT-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512POPCNT-NEXT:    vmovq %xmm0, %rdx
-; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512POPCNT-NEXT:    lzcntq %rsi, %rdi
-; AVX512POPCNT-NEXT:    lzcntq %rdx, %r8
-; AVX512POPCNT-NEXT:    addl $64, %r8d
-; AVX512POPCNT-NEXT:    testq %rsi, %rsi
-; AVX512POPCNT-NEXT:    cmovnel %edi, %r8d
-; AVX512POPCNT-NEXT:    lzcntq %rcx, %rdi
-; AVX512POPCNT-NEXT:    lzcntq %rax, %rax
-; AVX512POPCNT-NEXT:    addl $64, %eax
-; AVX512POPCNT-NEXT:    testq %rcx, %rcx
-; AVX512POPCNT-NEXT:    cmovnel %edi, %eax
-; AVX512POPCNT-NEXT:    subl $-128, %eax
-; AVX512POPCNT-NEXT:    orq %rsi, %rdx
-; AVX512POPCNT-NEXT:    cmovnel %r8d, %eax
-; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512POPCNT-NEXT:    vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT:    vplzcntq %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    vpcompressq %ymm0, %ymm0 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
 ; AVX512POPCNT-NEXT:    vzeroupper
 ; AVX512POPCNT-NEXT:    retq
   %a0 = bitcast <8 x i32> %v0 to i256
@@ -4887,72 +4816,47 @@ define i32 @vector_cttz_i256(<8 x i32> %v0) nounwind {
 ;
 ; AVX512F-LABEL: vector_cttz_i256:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vmovq %xmm1, %rcx
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT:    vmovq %xmm0, %rsi
-; AVX512F-NEXT:    tzcntq %rsi, %rdi
-; AVX512F-NEXT:    tzcntq %rdx, %r8
-; AVX512F-NEXT:    addl $64, %r8d
-; AVX512F-NEXT:    testq %rsi, %rsi
-; AVX512F-NEXT:    cmovnel %edi, %r8d
-; AVX512F-NEXT:    tzcntq %rcx, %rdi
-; AVX512F-NEXT:    tzcntq %rax, %rax
-; AVX512F-NEXT:    addl $64, %eax
-; AVX512F-NEXT:    testq %rcx, %rcx
-; AVX512F-NEXT:    cmovnel %edi, %eax
-; AVX512F-NEXT:    subl $-128, %eax
-; AVX512F-NEXT:    orq %rdx, %rsi
-; AVX512F-NEXT:    cmovnel %r8d, %eax
-; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [256,256,256,256]
+; AVX512F-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddq %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT:    vpandn %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT:    vplzcntq %zmm2, %zmm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [64,128,192,256]
+; AVX512F-NEXT:    vpsubq %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovd %xmm1, %eax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vector_cttz_i256:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vmovq %xmm1, %rcx
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT:    vmovq %xmm0, %rsi
-; AVX512VL-NEXT:    tzcntq %rsi, %rdi
-; AVX512VL-NEXT:    tzcntq %rdx, %r8
-; AVX512VL-NEXT:    addl $64, %r8d
-; AVX512VL-NEXT:    testq %rsi, %rsi
-; AVX512VL-NEXT:    cmovnel %edi, %r8d
-; AVX512VL-NEXT:    tzcntq %rcx, %rdi
-; AVX512VL-NEXT:    tzcntq %rax, %rax
-; AVX512VL-NEXT:    addl $64, %eax
-; AVX512VL-NEXT:    testq %rcx, %rcx
-; AVX512VL-NEXT:    cmovnel %edi, %eax
-; AVX512VL-NEXT:    subl $-128, %eax
-; AVX512VL-NEXT:    orq %rdx, %rsi
-; AVX512VL-NEXT:    cmovnel %r8d, %eax
-; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpandn %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vplzcntq %ymm1, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [64,128,192,256]
+; AVX512VL-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512VL-NEXT:    vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512POPCNT-LABEL: vector_cttz_i256:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512POPCNT-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512POPCNT-NEXT:    vmovq %xmm1, %rcx
-; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512POPCNT-NEXT:    vmovq %xmm0, %rsi
-; AVX512POPCNT-NEXT:    tzcntq %rsi, %rdi
-; AVX512POPCNT-NEXT:    tzcntq %rdx, %r8
-; AVX512POPCNT-NEXT:    addl $64, %r8d
-; AVX512POPCNT-NEXT:    testq %rsi, %rsi
-; AVX512POPCNT-NEXT:    cmovnel %edi, %r8d
-; AVX512POPCNT-NEXT:    tzcntq %rcx, %rdi
-; AVX512POPCNT-NEXT:    tzcntq %rax, %rax
-; AVX512POPCNT-NEXT:    addl $64, %eax
-; AVX512POPCNT-NEXT:    testq %rcx, %rcx
-; AVX512POPCNT-NEXT:    cmovnel %edi, %eax
-; AVX512POPCNT-NEXT:    subl $-128, %eax
-; AVX512POPCNT-NEXT:    orq %rdx, %rsi
-; AVX512POPCNT-NEXT:    cmovnel %r8d, %eax
-; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512POPCNT-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT:    vpandn %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT:    vpopcntq %ymm1, %ymm1
+; AVX512POPCNT-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT:    vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512POPCNT-NEXT:    vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
 ; AVX512POPCNT-NEXT:    vzeroupper
 ; AVX512POPCNT-NEXT:    retq
   %a0 = bitcast <8 x i32> %v0 to i256
@@ -6484,72 +6388,44 @@ define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind {
 ;
 ; AVX512F-LABEL: vector_cttz_undef_i256:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vmovq %xmm1, %rcx
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT:    vmovq %xmm0, %rsi
-; AVX512F-NEXT:    tzcntq %rsi, %rdi
-; AVX512F-NEXT:    tzcntq %rdx, %r8
-; AVX512F-NEXT:    addl $64, %r8d
-; AVX512F-NEXT:    testq %rsi, %rsi
-; AVX512F-NEXT:    cmovnel %edi, %r8d
-; AVX512F-NEXT:    tzcntq %rcx, %rdi
-; AVX512F-NEXT:    tzcntq %rax, %rax
-; AVX512F-NEXT:    addl $64, %eax
-; AVX512F-NEXT:    testq %rcx, %rcx
-; AVX512F-NEXT:    cmovnel %edi, %eax
-; AVX512F-NEXT:    subl $-128, %eax
-; AVX512F-NEXT:    orq %rdx, %rsi
-; AVX512F-NEXT:    cmovnel %r8d, %eax
-; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vpandn %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [64,128,192,256]
+; AVX512F-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm0, %eax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vector_cttz_undef_i256:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vmovq %xmm1, %rcx
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT:    vmovq %xmm0, %rsi
-; AVX512VL-NEXT:    tzcntq %rsi, %rdi
-; AVX512VL-NEXT:    tzcntq %rdx, %r8
-; AVX512VL-NEXT:    addl $64, %r8d
-; AVX512VL-NEXT:    testq %rsi, %rsi
-; AVX512VL-NEXT:    cmovnel %edi, %r8d
-; AVX512VL-NEXT:    tzcntq %rcx, %rdi
-; AVX512VL-NEXT:    tzcntq %rax, %rax
-; AVX512VL-NEXT:    addl $64, %eax
-; AVX512VL-NEXT:    testq %rcx, %rcx
-; AVX512VL-NEXT:    cmovnel %edi, %eax
-; AVX512VL-NEXT:    subl $-128, %eax
-; AVX512VL-NEXT:    orq %rdx, %rsi
-; AVX512VL-NEXT:    cmovnel %r8d, %eax
-; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpandn %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [64,128,192,256]
+; AVX512VL-NEXT:    vplzcntq %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vpcompressq %ymm1, %ymm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512POPCNT-LABEL: vector_cttz_undef_i256:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512POPCNT-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512POPCNT-NEXT:    vmovq %xmm1, %rcx
-; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512POPCNT-NEXT:    vmovq %xmm0, %rsi
-; AVX512POPCNT-NEXT:    tzcntq %rsi, %rdi
-; AVX512POPCNT-NEXT:    tzcntq %rdx, %r8
-; AVX512POPCNT-NEXT:    addl $64, %r8d
-; AVX512POPCNT-NEXT:    testq %rsi, %rsi
-; AVX512POPCNT-NEXT:    cmovnel %edi, %r8d
-; AVX512POPCNT-NEXT:    tzcntq %rcx, %rdi
-; AVX512POPCNT-NEXT:    tzcntq %rax, %rax
-; AVX512POPCNT-NEXT:    addl $64, %eax
-; AVX512POPCNT-NEXT:    testq %rcx, %rcx
-; AVX512POPCNT-NEXT:    cmovnel %edi, %eax
-; AVX512POPCNT-NEXT:    subl $-128, %eax
-; AVX512POPCNT-NEXT:    orq %rdx, %rsi
-; AVX512POPCNT-NEXT:    cmovnel %r8d, %eax
-; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512POPCNT-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT:    vpandn %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT:    vpopcntq %ymm1, %ymm1
+; AVX512POPCNT-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT:    vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT:    vpcompressq %ymm1, %ymm0 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
 ; AVX512POPCNT-NEXT:    vzeroupper
 ; AVX512POPCNT-NEXT:    retq
   %a0 = bitcast <8 x i32> %v0 to i256

``````````

</details>


https://github.com/llvm/llvm-project/pull/171589


More information about the llvm-commits mailing list