[llvm] 00bccfc - [X86] bitcnt-big-integer.ll - add additional test coverage where the source values are bitcast from vectors (#171481)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 9 10:16:07 PST 2025
Author: Simon Pilgrim
Date: 2025-12-09T18:16:03Z
New Revision: 00bccfca7cc4aacbbef6127a411c22c0e08bc466
URL: https://github.com/llvm/llvm-project/commit/00bccfca7cc4aacbbef6127a411c22c0e08bc466
DIFF: https://github.com/llvm/llvm-project/commit/00bccfca7cc4aacbbef6127a411c22c0e08bc466.diff
LOG: [X86] bitcnt-big-integer.ll - add additional test coverage where the source values are bitcast from vectors (#171481)
Added:
Modified:
llvm/test/CodeGen/X86/bitcnt-big-integer.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index 0fd555991ae29..749b3ddc96d0d 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -52,6 +52,63 @@ define i32 @load_ctpop_i128(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_ctpop_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: popcntq %rcx, %rcx
+; SSE-NEXT: popcntq %rax, %rax
+; SSE-NEXT: addl %ecx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctpop_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: popcntq %rax, %rdx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: popcntq %rcx, %rax
+; AVX2-NEXT: addl %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctpop_i128:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vmovq %xmm0, %rcx
+; AVX512F-NEXT: popcntq %rax, %rdx
+; AVX512F-NEXT: popcntq %rcx, %rax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctpop_i128:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: popcntq %rcx, %rcx
+; AVX512VL-NEXT: popcntq %rax, %rax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i128:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: popcntq %rcx, %rcx
+; AVX512POPCNT-NEXT: popcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128
+ %cnt = call i128 @llvm.ctpop.i128(i128 %a0)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctpop_i256(i256 %a0) nounwind {
; CHECK-LABEL: test_ctpop_i256:
; CHECK: # %bb.0:
@@ -183,6 +240,107 @@ define i32 @load_ctpop_i256(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_ctpop_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: movq %xmm1, %rdx
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: popcntq %rsi, %rsi
+; SSE-NEXT: popcntq %rdx, %rdx
+; SSE-NEXT: addl %esi, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: popcntq %rax, %rsi
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: popcntq %rcx, %rax
+; SSE-NEXT: addl %esi, %eax
+; SSE-NEXT: addl %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctpop_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: vmovq %xmm0, %rsi
+; AVX2-NEXT: popcntq %rdx, %rdx
+; AVX2-NEXT: popcntq %rsi, %rsi
+; AVX2-NEXT: addl %edx, %esi
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: popcntq %rax, %rdx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: popcntq %rcx, %rax
+; AVX2-NEXT: addl %edx, %eax
+; AVX2-NEXT: addl %esi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctpop_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vmovq %xmm0, %rcx
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: popcntq %rdx, %rdx
+; AVX512F-NEXT: popcntq %rsi, %rsi
+; AVX512F-NEXT: addl %edx, %esi
+; AVX512F-NEXT: popcntq %rax, %rdx
+; AVX512F-NEXT: popcntq %rcx, %rax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: addl %esi, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctpop_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT: vmovq %xmm0, %rcx
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq %xmm0, %rdx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT: popcntq %rsi, %rsi
+; AVX512VL-NEXT: popcntq %rdx, %rdx
+; AVX512VL-NEXT: addl %esi, %edx
+; AVX512VL-NEXT: xorl %esi, %esi
+; AVX512VL-NEXT: popcntq %rax, %rsi
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq %rcx, %rax
+; AVX512VL-NEXT: addl %esi, %eax
+; AVX512VL-NEXT: addl %edx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rcx
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512POPCNT-NEXT: popcntq %rsi, %rsi
+; AVX512POPCNT-NEXT: popcntq %rdx, %rdx
+; AVX512POPCNT-NEXT: addl %esi, %edx
+; AVX512POPCNT-NEXT: xorl %esi, %esi
+; AVX512POPCNT-NEXT: popcntq %rax, %rsi
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq %rcx, %rax
+; AVX512POPCNT-NEXT: addl %esi, %eax
+; AVX512POPCNT-NEXT: addl %edx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <8 x i32> %v0 to i256
+ %cnt = call i256 @llvm.ctpop.i256(i256 %a0)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctpop_i512(i512 %a0) nounwind {
; CHECK-LABEL: test_ctpop_i512:
; CHECK: # %bb.0:
@@ -404,6 +562,166 @@ define i32 @load_ctpop_i512(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_ctpop_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: movq %xmm1, %rdx
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: pextrq $1, %xmm2, %rdi
+; SSE-NEXT: movq %xmm2, %r8
+; SSE-NEXT: movq %xmm3, %r9
+; SSE-NEXT: pextrq $1, %xmm3, %r10
+; SSE-NEXT: popcntq %r10, %r10
+; SSE-NEXT: popcntq %r9, %r9
+; SSE-NEXT: addl %r10d, %r9d
+; SSE-NEXT: popcntq %rdi, %rdi
+; SSE-NEXT: popcntq %r8, %r8
+; SSE-NEXT: addl %edi, %r8d
+; SSE-NEXT: addl %r9d, %r8d
+; SSE-NEXT: popcntq %rsi, %rsi
+; SSE-NEXT: popcntq %rdx, %rdx
+; SSE-NEXT: addl %esi, %edx
+; SSE-NEXT: popcntq %rcx, %rcx
+; SSE-NEXT: popcntq %rax, %rax
+; SSE-NEXT: addl %ecx, %eax
+; SSE-NEXT: addl %edx, %eax
+; SSE-NEXT: addl %r8d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctpop_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm1, %rdi
+; AVX2-NEXT: vmovq %xmm1, %r8
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %r9
+; AVX2-NEXT: vmovq %xmm0, %r10
+; AVX2-NEXT: popcntq %r9, %r9
+; AVX2-NEXT: popcntq %r10, %r10
+; AVX2-NEXT: addl %r9d, %r10d
+; AVX2-NEXT: popcntq %rdi, %rdi
+; AVX2-NEXT: popcntq %r8, %r8
+; AVX2-NEXT: addl %edi, %r8d
+; AVX2-NEXT: addl %r10d, %r8d
+; AVX2-NEXT: popcntq %rsi, %rsi
+; AVX2-NEXT: popcntq %rdx, %rdx
+; AVX2-NEXT: addl %esi, %edx
+; AVX2-NEXT: popcntq %rcx, %rcx
+; AVX2-NEXT: popcntq %rax, %rax
+; AVX2-NEXT: addl %ecx, %eax
+; AVX2-NEXT: addl %edx, %eax
+; AVX2-NEXT: addl %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctpop_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rdi
+; AVX512F-NEXT: vmovq %xmm1, %r8
+; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512F-NEXT: vpextrq $1, %xmm0, %r9
+; AVX512F-NEXT: vmovq %xmm0, %r10
+; AVX512F-NEXT: popcntq %r9, %r9
+; AVX512F-NEXT: popcntq %r10, %r10
+; AVX512F-NEXT: addl %r9d, %r10d
+; AVX512F-NEXT: popcntq %rdi, %rdi
+; AVX512F-NEXT: popcntq %r8, %r8
+; AVX512F-NEXT: addl %edi, %r8d
+; AVX512F-NEXT: addl %r10d, %r8d
+; AVX512F-NEXT: popcntq %rdx, %rdx
+; AVX512F-NEXT: popcntq %rsi, %rsi
+; AVX512F-NEXT: addl %edx, %esi
+; AVX512F-NEXT: popcntq %rcx, %rcx
+; AVX512F-NEXT: popcntq %rax, %rax
+; AVX512F-NEXT: addl %ecx, %eax
+; AVX512F-NEXT: addl %esi, %eax
+; AVX512F-NEXT: addl %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctpop_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovq %xmm1, %rax
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT: vmovq %xmm0, %rsi
+; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512VL-NEXT: vmovq %xmm1, %rdi
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %r8
+; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512VL-NEXT: vmovq %xmm0, %r9
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %r10
+; AVX512VL-NEXT: popcntq %r10, %r10
+; AVX512VL-NEXT: popcntq %r9, %r9
+; AVX512VL-NEXT: addl %r10d, %r9d
+; AVX512VL-NEXT: popcntq %r8, %r8
+; AVX512VL-NEXT: popcntq %rdi, %rdi
+; AVX512VL-NEXT: addl %r8d, %edi
+; AVX512VL-NEXT: addl %r9d, %edi
+; AVX512VL-NEXT: popcntq %rdx, %rdx
+; AVX512VL-NEXT: popcntq %rsi, %rsi
+; AVX512VL-NEXT: addl %edx, %esi
+; AVX512VL-NEXT: popcntq %rcx, %rcx
+; AVX512VL-NEXT: popcntq %rax, %rax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: addl %esi, %eax
+; AVX512VL-NEXT: addl %edi, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512POPCNT-NEXT: vmovq %xmm1, %rax
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi
+; AVX512POPCNT-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512POPCNT-NEXT: vmovq %xmm1, %rdi
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %r8
+; AVX512POPCNT-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512POPCNT-NEXT: vmovq %xmm0, %r9
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %r10
+; AVX512POPCNT-NEXT: popcntq %r10, %r10
+; AVX512POPCNT-NEXT: popcntq %r9, %r9
+; AVX512POPCNT-NEXT: addl %r10d, %r9d
+; AVX512POPCNT-NEXT: popcntq %r8, %r8
+; AVX512POPCNT-NEXT: popcntq %rdi, %rdi
+; AVX512POPCNT-NEXT: addl %r8d, %edi
+; AVX512POPCNT-NEXT: addl %r9d, %edi
+; AVX512POPCNT-NEXT: popcntq %rdx, %rdx
+; AVX512POPCNT-NEXT: popcntq %rsi, %rsi
+; AVX512POPCNT-NEXT: addl %edx, %esi
+; AVX512POPCNT-NEXT: popcntq %rcx, %rcx
+; AVX512POPCNT-NEXT: popcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: addl %esi, %eax
+; AVX512POPCNT-NEXT: addl %edi, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <16 x i32> %v0 to i512
+ %cnt = call i512 @llvm.ctpop.i512(i512 %a0)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
; SSE-LABEL: test_ctpop_i1024:
; SSE: # %bb.0:
@@ -969,6 +1287,75 @@ define i32 @load_ctlz_i128(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_ctlz_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: pextrq $1, %xmm0, %rdx
+; SSE-NEXT: bsrq %rdx, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: movl $127, %eax
+; SSE-NEXT: bsrq %rcx, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: lzcntq %rcx, %rdx
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_i128:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT: lzcntq %rcx, %rdx
+; AVX512F-NEXT: lzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_i128:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: lzcntq %rcx, %rdx
+; AVX512VL-NEXT: lzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_i128:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx
+; AVX512POPCNT-NEXT: lzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128
+ %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 0)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctlz_i256(i256 %a0) nounwind {
; SSE-LABEL: test_ctlz_i256:
; SSE: # %bb.0:
@@ -1125,6 +1512,135 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_ctlz_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: pextrq $1, %xmm0, %rdx
+; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: bsrq %rsi, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: bsrq %rax, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %r8d
+; SSE-NEXT: bsrq %rdx, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: movl $127, %eax
+; SSE-NEXT: bsrq %rcx, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: lzcntq %rsi, %rdi
+; AVX2-NEXT: lzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: lzcntq %rcx, %rdi
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, %rdx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512F-NEXT: lzcntq %rsi, %rdi
+; AVX512F-NEXT: lzcntq %rdx, %r8
+; AVX512F-NEXT: addl $64, %r8d
+; AVX512F-NEXT: testq %rsi, %rsi
+; AVX512F-NEXT: cmovnel %edi, %r8d
+; AVX512F-NEXT: lzcntq %rcx, %rdi
+; AVX512F-NEXT: lzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edi, %eax
+; AVX512F-NEXT: subl $-128, %eax
+; AVX512F-NEXT: orq %rsi, %rdx
+; AVX512F-NEXT: cmovnel %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq %xmm0, %rdx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT: lzcntq %rsi, %rdi
+; AVX512VL-NEXT: lzcntq %rdx, %r8
+; AVX512VL-NEXT: addl $64, %r8d
+; AVX512VL-NEXT: testq %rsi, %rsi
+; AVX512VL-NEXT: cmovnel %edi, %r8d
+; AVX512VL-NEXT: lzcntq %rcx, %rdi
+; AVX512VL-NEXT: lzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edi, %eax
+; AVX512VL-NEXT: subl $-128, %eax
+; AVX512VL-NEXT: orq %rsi, %rdx
+; AVX512VL-NEXT: cmovnel %r8d, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT: lzcntq %rdx, %r8
+; AVX512POPCNT-NEXT: addl $64, %r8d
+; AVX512POPCNT-NEXT: testq %rsi, %rsi
+; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT: lzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edi, %eax
+; AVX512POPCNT-NEXT: subl $-128, %eax
+; AVX512POPCNT-NEXT: orq %rsi, %rdx
+; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <8 x i32> %v0 to i256
+ %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctlz_i512(i512 %a0) nounwind {
; SSE-LABEL: test_ctlz_i512:
; SSE: # %bb.0:
@@ -1423,10 +1939,155 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind {
; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: load_ctlz_i512:
+; AVX512F-LABEL: load_ctlz_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_ctlz_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_ctlz_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = load i512, ptr %p0
+ %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @vector_ctlz_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rdx
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: pextrq $1, %xmm1, %rax
+; SSE-NEXT: pextrq $1, %xmm2, %rdi
+; SSE-NEXT: movq %xmm2, %rsi
+; SSE-NEXT: movq %xmm3, %r8
+; SSE-NEXT: pextrq $1, %xmm3, %r9
+; SSE-NEXT: bsrq %r9, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: bsrq %r8, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %r10d, %r8d
+; SSE-NEXT: bsrq %rdi, %r9
+; SSE-NEXT: xorl $63, %r9d
+; SSE-NEXT: bsrq %rsi, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: orl $64, %esi
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %r9d, %esi
+; SSE-NEXT: movq %xmm1, %rdi
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: ptest %xmm3, %xmm3
+; SSE-NEXT: cmovnel %r8d, %esi
+; SSE-NEXT: bsrq %rax, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: bsrq %rdi, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: orl $64, %edi
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %r8d, %edi
+; SSE-NEXT: bsrq %rcx, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: movl $127, %eax
+; SSE-NEXT: bsrq %rdx, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT: vmovq %xmm2, %rdx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: vmovq %xmm2, %r8
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: vmovq %xmm1, %rdi
+; AVX2-NEXT: vpextrq $1, %xmm1, %r9
+; AVX2-NEXT: lzcntq %rax, %r10
+; AVX2-NEXT: lzcntq %r8, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: cmovnel %r10d, %r11d
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: lzcntq %r9, %r10
+; AVX2-NEXT: lzcntq %rdi, %rdi
+; AVX2-NEXT: addl $64, %edi
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %edi
+; AVX2-NEXT: subl $-128, %edi
+; AVX2-NEXT: orq %rax, %r8
+; AVX2-NEXT: cmovnel %r11d, %edi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rcx, %rax
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: lzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: lzcntq %rsi, %r9
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: vptest %ymm1, %ymm1
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
@@ -1435,10 +2096,10 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind {
; AVX512F-NEXT: vmovd %xmm1, %eax
; AVX512F-NEXT: retq
;
-; AVX512VL-LABEL: load_ctlz_i512:
+; AVX512VL-LABEL: vector_ctlz_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
@@ -1448,10 +2109,10 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind {
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
-; AVX512POPCNT-LABEL: load_ctlz_i512:
+; AVX512POPCNT-LABEL: vector_ctlz_i512:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
@@ -1460,7 +2121,7 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind {
; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
; AVX512POPCNT-NEXT: vzeroupper
; AVX512POPCNT-NEXT: retq
- %a0 = load i512, ptr %p0
+ %a0 = bitcast <16 x i32> %v0 to i512
%cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0)
%res = trunc i512 %cnt to i32
ret i32 %res
@@ -2312,6 +2973,74 @@ define i32 @load_ctlz_undef_i128(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_undef_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: bsrq %rcx, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: bsrq %rax, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_undef_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: lzcntq %rcx, %rdx
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_undef_i128:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT: lzcntq %rcx, %rdx
+; AVX512F-NEXT: lzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_undef_i128:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: lzcntq %rcx, %rdx
+; AVX512VL-NEXT: lzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_undef_i128:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx
+; AVX512POPCNT-NEXT: lzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128
+ %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctlz_undef_i256(i256 %a0) nounwind {
; SSE-LABEL: test_ctlz_undef_i256:
; SSE: # %bb.0:
@@ -2463,6 +3192,134 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_undef_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: movq %xmm1, %rdx
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: bsrq %rsi, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: bsrq %rdx, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: orl $64, %edx
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %edx
+; SSE-NEXT: bsrq %rcx, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: bsrq %rax, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_undef_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: lzcntq %rsi, %rdi
+; AVX2-NEXT: lzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: lzcntq %rcx, %rdi
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_undef_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, %rdx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512F-NEXT: lzcntq %rsi, %rdi
+; AVX512F-NEXT: lzcntq %rdx, %r8
+; AVX512F-NEXT: addl $64, %r8d
+; AVX512F-NEXT: testq %rsi, %rsi
+; AVX512F-NEXT: cmovnel %edi, %r8d
+; AVX512F-NEXT: lzcntq %rcx, %rdi
+; AVX512F-NEXT: lzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edi, %eax
+; AVX512F-NEXT: subl $-128, %eax
+; AVX512F-NEXT: orq %rsi, %rdx
+; AVX512F-NEXT: cmovnel %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_undef_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq %xmm0, %rdx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT: lzcntq %rsi, %rdi
+; AVX512VL-NEXT: lzcntq %rdx, %r8
+; AVX512VL-NEXT: addl $64, %r8d
+; AVX512VL-NEXT: testq %rsi, %rsi
+; AVX512VL-NEXT: cmovnel %edi, %r8d
+; AVX512VL-NEXT: lzcntq %rcx, %rdi
+; AVX512VL-NEXT: lzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edi, %eax
+; AVX512VL-NEXT: subl $-128, %eax
+; AVX512VL-NEXT: orq %rsi, %rdx
+; AVX512VL-NEXT: cmovnel %r8d, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_undef_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT: lzcntq %rdx, %r8
+; AVX512POPCNT-NEXT: addl $64, %r8d
+; AVX512POPCNT-NEXT: testq %rsi, %rsi
+; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT: lzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edi, %eax
+; AVX512POPCNT-NEXT: subl $-128, %eax
+; AVX512POPCNT-NEXT: orq %rsi, %rdx
+; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <8 x i32> %v0 to i256
+ %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctlz_undef_i512(i512 %a0) nounwind {
; SSE-LABEL: test_ctlz_undef_i512:
; SSE: # %bb.0:
@@ -2796,6 +3653,147 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { ; leading-zero count of a <16 x i32> argument viewed as one i512, returned as i32
+; SSE-LABEL: vector_ctlz_undef_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: pextrq $1, %xmm1, %rax
+; SSE-NEXT: pextrq $1, %xmm2, %rsi
+; SSE-NEXT: movq %xmm2, %rdx
+; SSE-NEXT: movq %xmm3, %rdi
+; SSE-NEXT: pextrq $1, %xmm3, %r8
+; SSE-NEXT: bsrq %r8, %r9
+; SSE-NEXT: xorl $63, %r9d
+; SSE-NEXT: bsrq %rdi, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: orl $64, %edi
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %r9d, %edi
+; SSE-NEXT: bsrq %rsi, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: bsrq %rdx, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: orl $64, %edx
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %r8d, %edx
+; SSE-NEXT: movq %xmm0, %rsi
+; SSE-NEXT: subl $-128, %edx
+; SSE-NEXT: ptest %xmm3, %xmm3
+; SSE-NEXT: movq %xmm1, %r8
+; SSE-NEXT: cmovnel %edi, %edx
+; SSE-NEXT: bsrq %rax, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: bsrq %r8, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %edi, %r8d
+; SSE-NEXT: bsrq %rcx, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: bsrq %rsi, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_undef_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT: vmovq %xmm2, %rdx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: vmovq %xmm2, %r8
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: vmovq %xmm1, %rdi
+; AVX2-NEXT: vpextrq $1, %xmm1, %r9
+; AVX2-NEXT: lzcntq %rax, %r10
+; AVX2-NEXT: lzcntq %r8, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: cmovnel %r10d, %r11d
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: lzcntq %r9, %r10
+; AVX2-NEXT: lzcntq %rdi, %rdi
+; AVX2-NEXT: addl $64, %edi
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %edi
+; AVX2-NEXT: subl $-128, %edi
+; AVX2-NEXT: orq %rax, %r8
+; AVX2-NEXT: cmovnel %r11d, %edi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rcx, %rax
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: lzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: lzcntq %rsi, %r9
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: vptest %ymm1, %ymm1
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_undef_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_undef_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_undef_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <16 x i32> %v0 to i512 ; reinterpret the 512 vector bits as a single wide integer
+ %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) ; i1 -1 is true: the is_zero_poison flag is set, so a zero input yields poison
+ %res = trunc i512 %cnt to i32 ; keep only the low 32 bits of the i512 count
+ ret i32 %res
+}
+
define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind {
; SSE-LABEL: test_ctlz_undef_i1024:
; SSE: # %bb.0:
@@ -3636,6 +4634,49 @@ define i32 @load_cttz_i128(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_cttz_i128(<4 x i32> %v0) nounwind { ; trailing-zero count of a <4 x i32> argument viewed as one i128, returned as i32
+; SSE-LABEL: vector_cttz_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: movq %xmm0, %rdx
+; SSE-NEXT: rep bsfq %rdx, %rsi
+; SSE-NEXT: movl $64, %eax
+; SSE-NEXT: rep bsfq %rcx, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: tzcntq %rcx, %rdx
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vector_cttz_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: tzcntq %rcx, %rdx
+; AVX512-NEXT: tzcntq %rax, %rax
+; AVX512-NEXT: addl $64, %eax
+; AVX512-NEXT: testq %rcx, %rcx
+; AVX512-NEXT: cmovnel %edx, %eax
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128 ; reinterpret the 128 vector bits as a single wide integer
+ %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) ; i1 0: the is_zero_poison flag is clear, so a zero input must give a defined result
+ %res = trunc i128 %cnt to i32 ; keep only the low 32 bits of the i128 count
+ ret i32 %res
+}
+
define i32 @test_cttz_i256(i256 %a0) nounwind {
; SSE-LABEL: test_cttz_i256:
; SSE: # %bb.0:
@@ -3775,21 +4816,146 @@ define i32 @load_cttz_i256(ptr %p0) nounwind {
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
-; AVX512POPCNT-LABEL: load_cttz_i256:
+; AVX512POPCNT-LABEL: load_cttz_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = load i256, ptr %p0
+ %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @vector_cttz_i256(<8 x i32> %v0) nounwind { ; trailing-zero count of a <8 x i32> argument viewed as one i256 (cttz flag operand is i1 0), returned as i32
+; SSE-LABEL: vector_cttz_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm1, %rcx
+; SSE-NEXT: pextrq $1, %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rdx
+; SSE-NEXT: rep bsfq %rdx, %rsi
+; SSE-NEXT: rep bsfq %rax, %rdi
+; SSE-NEXT: addl $64, %edi
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %esi, %edi
+; SSE-NEXT: movq %xmm1, %rdx
+; SSE-NEXT: rep bsfq %rdx, %rsi
+; SSE-NEXT: movl $64, %eax
+; SSE-NEXT: rep bsfq %rcx, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: vmovq %xmm0, %rsi
+; AVX2-NEXT: tzcntq %rsi, %rdi
+; AVX2-NEXT: tzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: tzcntq %rcx, %rdi
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_cttz_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT: vmovq %xmm1, %rcx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: tzcntq %rsi, %rdi
+; AVX512F-NEXT: tzcntq %rdx, %r8
+; AVX512F-NEXT: addl $64, %r8d
+; AVX512F-NEXT: testq %rsi, %rsi
+; AVX512F-NEXT: cmovnel %edi, %r8d
+; AVX512F-NEXT: tzcntq %rcx, %rdi
+; AVX512F-NEXT: tzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edi, %eax
+; AVX512F-NEXT: subl $-128, %eax
+; AVX512F-NEXT: orq %rdx, %rsi
+; AVX512F-NEXT: cmovnel %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_cttz_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT: vmovq %xmm1, %rcx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT: vmovq %xmm0, %rsi
+; AVX512VL-NEXT: tzcntq %rsi, %rdi
+; AVX512VL-NEXT: tzcntq %rdx, %r8
+; AVX512VL-NEXT: addl $64, %r8d
+; AVX512VL-NEXT: testq %rsi, %rsi
+; AVX512VL-NEXT: cmovnel %edi, %r8d
+; AVX512VL-NEXT: tzcntq %rcx, %rdi
+; AVX512VL-NEXT: tzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edi, %eax
+; AVX512VL-NEXT: subl $-128, %eax
+; AVX512VL-NEXT: orq %rdx, %rsi
+; AVX512VL-NEXT: cmovnel %r8d, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_i256:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1
-; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1
-; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1
-; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1
-; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
-; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
-; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi
+; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT: tzcntq %rdx, %r8
+; AVX512POPCNT-NEXT: addl $64, %r8d
+; AVX512POPCNT-NEXT: testq %rsi, %rsi
+; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT: tzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edi, %eax
+; AVX512POPCNT-NEXT: subl $-128, %eax
+; AVX512POPCNT-NEXT: orq %rdx, %rsi
+; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512POPCNT-NEXT: vzeroupper
; AVX512POPCNT-NEXT: retq
- %a0 = load i256, ptr %p0
+ %a0 = bitcast <8 x i32> %v0 to i256 ; reinterpret the 256 vector bits as a single wide integer
%cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0)
%res = trunc i256 %cnt to i32
ret i32 %res
@@ -4128,6 +5294,148 @@ define i32 @load_cttz_i512(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_cttz_i512(<16 x i32> %v0) nounwind { ; trailing-zero count of a <16 x i32> argument viewed as one i512, returned as i32
+; SSE-LABEL: vector_cttz_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm3, %rdx
+; SSE-NEXT: movq %xmm3, %rcx
+; SSE-NEXT: pextrq $1, %xmm2, %rax
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: movq %xmm1, %rdi
+; SSE-NEXT: pextrq $1, %xmm0, %r8
+; SSE-NEXT: movq %xmm0, %r9
+; SSE-NEXT: rep bsfq %r9, %r10
+; SSE-NEXT: rep bsfq %r8, %r8
+; SSE-NEXT: addl $64, %r8d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %r10d, %r8d
+; SSE-NEXT: rep bsfq %rdi, %r9
+; SSE-NEXT: rep bsfq %rsi, %rsi
+; SSE-NEXT: addl $64, %esi
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %r9d, %esi
+; SSE-NEXT: movq %xmm2, %rdi
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %r8d, %esi
+; SSE-NEXT: rep bsfq %rdi, %r8
+; SSE-NEXT: rep bsfq %rax, %r9
+; SSE-NEXT: addl $64, %r9d
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %r8d, %r9d
+; SSE-NEXT: rep bsfq %rcx, %rdi
+; SSE-NEXT: movl $64, %eax
+; SSE-NEXT: rep bsfq %rdx, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %r9d, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT: vmovq %xmm1, %r8
+; AVX2-NEXT: vmovq %xmm0, %r9
+; AVX2-NEXT: tzcntq %r9, %r10
+; AVX2-NEXT: tzcntq %rdi, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %r11d
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: tzcntq %r8, %r10
+; AVX2-NEXT: tzcntq %rsi, %rsi
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %r10d, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %rdi, %r9
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: tzcntq %rdx, %rdi
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: tzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: vmovq %xmm2, %rdi
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: tzcntq %rdi, %r9
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rdi, %rdi
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_cttz_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_cttz_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <16 x i32> %v0 to i512 ; reinterpret the 512 vector bits as a single wide integer
+ %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) ; i1 0: the is_zero_poison flag is clear, so a zero input must give a defined result
+ %res = trunc i512 %cnt to i32 ; keep only the low 32 bits of the i512 count
+ ret i32 %res
+}
+
define i32 @test_cttz_i1024(i1024 %a0) nounwind {
; SSE-LABEL: test_cttz_i1024:
; SSE: # %bb.0:
@@ -4930,6 +6238,48 @@ define i32 @load_cttz_undef_i128(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_cttz_undef_i128(<4 x i32> %v0) nounwind { ; trailing-zero count of a <4 x i32> argument viewed as one i128, returned as i32
+; SSE-LABEL: vector_cttz_undef_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: rep bsfq %rcx, %rdx
+; SSE-NEXT: rep bsfq %rax, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_undef_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: tzcntq %rcx, %rdx
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vector_cttz_undef_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: tzcntq %rcx, %rdx
+; AVX512-NEXT: tzcntq %rax, %rax
+; AVX512-NEXT: addl $64, %eax
+; AVX512-NEXT: testq %rcx, %rcx
+; AVX512-NEXT: cmovnel %edx, %eax
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128 ; reinterpret the 128 vector bits as a single wide integer
+ %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1) ; i1 -1 is true: the is_zero_poison flag is set, so a zero input yields poison
+ %res = trunc i128 %cnt to i32 ; keep only the low 32 bits of the i128 count
+ ret i32 %res
+}
+
define i32 @test_cttz_undef_i256(i256 %a0) nounwind {
; SSE-LABEL: test_cttz_undef_i256:
; SSE: # %bb.0:
@@ -5084,6 +6434,130 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { ; trailing-zero count of a <8 x i32> argument viewed as one i256, returned as i32
+; SSE-LABEL: vector_cttz_undef_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rcx
+; SSE-NEXT: pextrq $1, %xmm0, %rdx
+; SSE-NEXT: movq %xmm0, %rsi
+; SSE-NEXT: rep bsfq %rsi, %rdi
+; SSE-NEXT: rep bsfq %rdx, %rdx
+; SSE-NEXT: addl $64, %edx
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %edx
+; SSE-NEXT: rep bsfq %rcx, %rsi
+; SSE-NEXT: rep bsfq %rax, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_undef_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: vmovq %xmm0, %rsi
+; AVX2-NEXT: tzcntq %rsi, %rdi
+; AVX2-NEXT: tzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: tzcntq %rcx, %rdi
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_cttz_undef_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT: vmovq %xmm1, %rcx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: tzcntq %rsi, %rdi
+; AVX512F-NEXT: tzcntq %rdx, %r8
+; AVX512F-NEXT: addl $64, %r8d
+; AVX512F-NEXT: testq %rsi, %rsi
+; AVX512F-NEXT: cmovnel %edi, %r8d
+; AVX512F-NEXT: tzcntq %rcx, %rdi
+; AVX512F-NEXT: tzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edi, %eax
+; AVX512F-NEXT: subl $-128, %eax
+; AVX512F-NEXT: orq %rdx, %rsi
+; AVX512F-NEXT: cmovnel %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_cttz_undef_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT: vmovq %xmm1, %rcx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT: vmovq %xmm0, %rsi
+; AVX512VL-NEXT: tzcntq %rsi, %rdi
+; AVX512VL-NEXT: tzcntq %rdx, %r8
+; AVX512VL-NEXT: addl $64, %r8d
+; AVX512VL-NEXT: testq %rsi, %rsi
+; AVX512VL-NEXT: cmovnel %edi, %r8d
+; AVX512VL-NEXT: tzcntq %rcx, %rdi
+; AVX512VL-NEXT: tzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edi, %eax
+; AVX512VL-NEXT: subl $-128, %eax
+; AVX512VL-NEXT: orq %rdx, %rsi
+; AVX512VL-NEXT: cmovnel %r8d, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_undef_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi
+; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT: tzcntq %rdx, %r8
+; AVX512POPCNT-NEXT: addl $64, %r8d
+; AVX512POPCNT-NEXT: testq %rsi, %rsi
+; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT: tzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edi, %eax
+; AVX512POPCNT-NEXT: subl $-128, %eax
+; AVX512POPCNT-NEXT: orq %rdx, %rsi
+; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <8 x i32> %v0 to i256 ; reinterpret the 256 vector bits as a single wide integer
+ %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) ; i1 -1 is true: the is_zero_poison flag is set, so a zero input yields poison
+ %res = trunc i256 %cnt to i32 ; keep only the low 32 bits of the i256 count
+ ret i32 %res
+}
+
define i32 @test_cttz_undef_i512(i512 %a0) nounwind {
; SSE-LABEL: test_cttz_undef_i512:
; SSE: # %bb.0:
@@ -5409,6 +6883,144 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { ; trailing-zero count of a <16 x i32> argument viewed as one i512, returned as i32
+; SSE-LABEL: vector_cttz_undef_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm3, %rax
+; SSE-NEXT: pextrq $1, %xmm2, %rdx
+; SSE-NEXT: pextrq $1, %xmm1, %rcx
+; SSE-NEXT: movq %xmm1, %rsi
+; SSE-NEXT: pextrq $1, %xmm0, %rdi
+; SSE-NEXT: movq %xmm0, %r8
+; SSE-NEXT: rep bsfq %r8, %r9
+; SSE-NEXT: rep bsfq %rdi, %rdi
+; SSE-NEXT: addl $64, %edi
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %r9d, %edi
+; SSE-NEXT: rep bsfq %rsi, %r8
+; SSE-NEXT: rep bsfq %rcx, %rcx
+; SSE-NEXT: addl $64, %ecx
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %r8d, %ecx
+; SSE-NEXT: movq %xmm2, %rsi
+; SSE-NEXT: subl $-128, %ecx
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %edi, %ecx
+; SSE-NEXT: rep bsfq %rsi, %rdi
+; SSE-NEXT: rep bsfq %rdx, %rdx
+; SSE-NEXT: addl $64, %edx
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %edx
+; SSE-NEXT: movq %xmm3, %rsi
+; SSE-NEXT: rep bsfq %rsi, %rdi
+; SSE-NEXT: rep bsfq %rax, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_undef_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT: vmovq %xmm1, %r8
+; AVX2-NEXT: vmovq %xmm0, %r9
+; AVX2-NEXT: tzcntq %r9, %r10
+; AVX2-NEXT: tzcntq %rdi, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %r11d
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: tzcntq %r8, %r10
+; AVX2-NEXT: tzcntq %rsi, %rsi
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %r10d, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %rdi, %r9
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: tzcntq %rdx, %rdi
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: tzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: vmovq %xmm2, %rdi
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: tzcntq %rdi, %r9
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rdi, %rdi
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_cttz_undef_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_cttz_undef_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_undef_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <16 x i32> %v0 to i512 ; reinterpret the 512 vector bits as a single wide integer
+ %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) ; i1 -1 is true: the is_zero_poison flag is set, so a zero input yields poison
+ %res = trunc i512 %cnt to i32 ; keep only the low 32 bits of the i512 count
+ ret i32 %res
+}
+
define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind {
; SSE-LABEL: test_cttz_undef_i1024:
; SSE: # %bb.0:
More information about the llvm-commits
mailing list