[llvm] [X86] Add ctpop/ctlz/cttz test coverage for very large integers (PR #164450)

via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 21 09:03:15 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

<details>
<summary>Changes</summary>

Inspired by #<!-- -->164275

---

Patch is 103.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164450.diff


1 Files Affected:

- (added) llvm/test/CodeGen/X86/bitcnt-big-integer.ll (+3021) 


``````````diff
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
new file mode 100644
index 0000000000000..13149d78b16fb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -0,0 +1,3021 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=CHECK,AVX512
+
+;
+; CTPOP
+;
+
+define i32 @test_ctpop_i128(i128 %a0) nounwind {
+; CHECK-LABEL: test_ctpop_i128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    popcntq %rsi, %rcx
+; CHECK-NEXT:    popcntq %rdi, %rax
+; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    retq
+  %cnt = call i128 @llvm.ctpop.i128(i128 %a0)
+  %res = trunc i128 %cnt to i32
+  ret i32 %res
+}
+
+define i32 @load_ctpop_i128(ptr %p0) nounwind {
+; CHECK-LABEL: load_ctpop_i128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    popcntq 8(%rdi), %rcx
+; CHECK-NEXT:    popcntq (%rdi), %rax
+; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    retq
+  %a0 = load i128, ptr %p0
+  %cnt = call i128 @llvm.ctpop.i128(i128 %a0)
+  %res = trunc i128 %cnt to i32
+  ret i32 %res
+}
+
+define i32 @test_ctpop_i256(i256 %a0) nounwind {
+; CHECK-LABEL: test_ctpop_i256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    popcntq %rcx, %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    popcntq %rdx, %rcx
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    popcntq %rsi, %rdx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    popcntq %rdi, %rax
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    retq
+  %cnt = call i256 @llvm.ctpop.i256(i256 %a0)
+  %res = trunc i256 %cnt to i32
+  ret i32 %res
+}
+
+define i32 @load_ctpop_i256(ptr %p0) nounwind {
+; SSE-LABEL: load_ctpop_i256:
+; SSE:       # %bb.0:
+; SSE-NEXT:    popcntq 24(%rdi), %rcx
+; SSE-NEXT:    popcntq 16(%rdi), %rdx
+; SSE-NEXT:    popcntq 8(%rdi), %rsi
+; SSE-NEXT:    popcntq (%rdi), %rax
+; SSE-NEXT:    addl %ecx, %edx
+; SSE-NEXT:    addl %esi, %eax
+; SSE-NEXT:    addl %edx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: load_ctpop_i256:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    popcntq 24(%rdi), %rax
+; AVX2-NEXT:    popcntq 16(%rdi), %rcx
+; AVX2-NEXT:    addl %eax, %ecx
+; AVX2-NEXT:    popcntq 8(%rdi), %rdx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq (%rdi), %rax
+; AVX2-NEXT:    addl %edx, %eax
+; AVX2-NEXT:    addl %ecx, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_ctpop_i256:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    popcntq 24(%rdi), %rax
+; AVX512-NEXT:    popcntq 16(%rdi), %rcx
+; AVX512-NEXT:    addl %eax, %ecx
+; AVX512-NEXT:    popcntq 8(%rdi), %rdx
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq (%rdi), %rax
+; AVX512-NEXT:    addl %edx, %eax
+; AVX512-NEXT:    addl %ecx, %eax
+; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    retq
+  %a0 = load i256, ptr %p0
+  %cnt = call i256 @llvm.ctpop.i256(i256 %a0)
+  %res = trunc i256 %cnt to i32
+  ret i32 %res
+}
+
+define i32 @test_ctpop_i512(i512 %a0) nounwind {
+; CHECK-LABEL: test_ctpop_i512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    popcntq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT:    addl %eax, %r10d
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    popcntq %r9, %rax
+; CHECK-NEXT:    popcntq %r8, %r8
+; CHECK-NEXT:    addl %eax, %r8d
+; CHECK-NEXT:    addl %r10d, %r8d
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    popcntq %rcx, %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    popcntq %rdx, %rcx
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    popcntq %rsi, %rdx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    popcntq %rdi, %rax
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    addl %r8d, %eax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    retq
+  %cnt = call i512 @llvm.ctpop.i512(i512 %a0)
+  %res = trunc i512 %cnt to i32
+  ret i32 %res
+}
+
+define i32 @load_ctpop_i512(ptr %p0) nounwind {
+; SSE-LABEL: load_ctpop_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    popcntq 56(%rdi), %rax
+; SSE-NEXT:    popcntq 48(%rdi), %rcx
+; SSE-NEXT:    popcntq 40(%rdi), %rdx
+; SSE-NEXT:    popcntq 32(%rdi), %rsi
+; SSE-NEXT:    addl %eax, %ecx
+; SSE-NEXT:    addl %edx, %esi
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq 24(%rdi), %rax
+; SSE-NEXT:    addl %ecx, %esi
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    popcntq 16(%rdi), %rcx
+; SSE-NEXT:    addl %eax, %ecx
+; SSE-NEXT:    xorl %edx, %edx
+; SSE-NEXT:    popcntq 8(%rdi), %rdx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq (%rdi), %rax
+; SSE-NEXT:    addl %edx, %eax
+; SSE-NEXT:    addl %ecx, %eax
+; SSE-NEXT:    addl %esi, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: load_ctpop_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    popcntq 56(%rdi), %rax
+; AVX2-NEXT:    popcntq 48(%rdi), %rcx
+; AVX2-NEXT:    addl %eax, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq 40(%rdi), %rax
+; AVX2-NEXT:    popcntq 32(%rdi), %rdx
+; AVX2-NEXT:    addl %eax, %edx
+; AVX2-NEXT:    addl %ecx, %edx
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    popcntq 24(%rdi), %rcx
+; AVX2-NEXT:    popcntq 16(%rdi), %rsi
+; AVX2-NEXT:    popcntq 8(%rdi), %r8
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq (%rdi), %rax
+; AVX2-NEXT:    addl %ecx, %esi
+; AVX2-NEXT:    addl %r8d, %eax
+; AVX2-NEXT:    addl %esi, %eax
+; AVX2-NEXT:    addl %edx, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_ctpop_i512:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    popcntq 56(%rdi), %rax
+; AVX512-NEXT:    popcntq 48(%rdi), %rcx
+; AVX512-NEXT:    addl %eax, %ecx
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq 40(%rdi), %rax
+; AVX512-NEXT:    popcntq 32(%rdi), %rdx
+; AVX512-NEXT:    addl %eax, %edx
+; AVX512-NEXT:    addl %ecx, %edx
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq 24(%rdi), %rax
+; AVX512-NEXT:    xorl %ecx, %ecx
+; AVX512-NEXT:    popcntq 16(%rdi), %rcx
+; AVX512-NEXT:    popcntq 8(%rdi), %rsi
+; AVX512-NEXT:    addl %eax, %ecx
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq (%rdi), %rax
+; AVX512-NEXT:    addl %esi, %eax
+; AVX512-NEXT:    addl %ecx, %eax
+; AVX512-NEXT:    addl %edx, %eax
+; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    retq
+  %a0 = load i512, ptr %p0
+  %cnt = call i512 @llvm.ctpop.i512(i512 %a0)
+  %res = trunc i512 %cnt to i32
+  ret i32 %res
+}
+
+define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
+; SSE-LABEL: test_ctpop_i1024:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    popcntq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    popcntq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    addl %eax, %r10d
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    addl %r11d, %eax
+; SSE-NEXT:    xorl %r11d, %r11d
+; SSE-NEXT:    popcntq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    xorl %ebx, %ebx
+; SSE-NEXT:    popcntq {{[0-9]+}}(%rsp), %rbx
+; SSE-NEXT:    addl %r10d, %eax
+; SSE-NEXT:    addl %r11d, %ebx
+; SSE-NEXT:    xorl %r11d, %r11d
+; SSE-NEXT:    popcntq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    xorl %r10d, %r10d
+; SSE-NEXT:    popcntq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    addl %r11d, %r10d
+; SSE-NEXT:    addl %ebx, %r10d
+; SSE-NEXT:    xorl %r11d, %r11d
+; SSE-NEXT:    popcntq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    xorl %ebx, %ebx
+; SSE-NEXT:    popcntq {{[0-9]+}}(%rsp), %rbx
+; SSE-NEXT:    addl %eax, %r10d
+; SSE-NEXT:    addl %r11d, %ebx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq %r9, %rax
+; SSE-NEXT:    popcntq %r8, %r8
+; SSE-NEXT:    addl %eax, %r8d
+; SSE-NEXT:    addl %ebx, %r8d
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq %rcx, %rax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    popcntq %rdx, %rcx
+; SSE-NEXT:    addl %eax, %ecx
+; SSE-NEXT:    xorl %edx, %edx
+; SSE-NEXT:    popcntq %rsi, %rdx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq %rdi, %rax
+; SSE-NEXT:    addl %edx, %eax
+; SSE-NEXT:    addl %ecx, %eax
+; SSE-NEXT:    addl %r8d, %eax
+; SSE-NEXT:    addl %r10d, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: test_ctpop_i1024:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    popcntq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    addl %eax, %r10d
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    popcntq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    addl %eax, %r11d
+; AVX2-NEXT:    addl %r10d, %r11d
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    popcntq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT:    xorl %r14d, %r14d
+; AVX2-NEXT:    popcntq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    addl %eax, %ebx
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    popcntq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    addl %r14d, %r10d
+; AVX2-NEXT:    addl %ebx, %r10d
+; AVX2-NEXT:    addl %r11d, %r10d
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    popcntq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    addl %eax, %r11d
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq %r9, %rax
+; AVX2-NEXT:    popcntq %r8, %r8
+; AVX2-NEXT:    addl %eax, %r8d
+; AVX2-NEXT:    addl %r11d, %r8d
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq %rcx, %rax
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    popcntq %rdx, %rcx
+; AVX2-NEXT:    addl %eax, %ecx
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    popcntq %rsi, %rdx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq %rdi, %rax
+; AVX2-NEXT:    addl %edx, %eax
+; AVX2-NEXT:    addl %ecx, %eax
+; AVX2-NEXT:    addl %r8d, %eax
+; AVX2-NEXT:    addl %r10d, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ctpop_i1024:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    popcntq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    addl %eax, %r10d
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    popcntq {{[0-9]+}}(%rsp), %r11
+; AVX512-NEXT:    addl %eax, %r11d
+; AVX512-NEXT:    addl %r10d, %r11d
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    xorl %ebx, %ebx
+; AVX512-NEXT:    popcntq {{[0-9]+}}(%rsp), %rbx
+; AVX512-NEXT:    xorl %r14d, %r14d
+; AVX512-NEXT:    popcntq {{[0-9]+}}(%rsp), %r14
+; AVX512-NEXT:    addl %eax, %ebx
+; AVX512-NEXT:    xorl %r10d, %r10d
+; AVX512-NEXT:    popcntq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    addl %r14d, %r10d
+; AVX512-NEXT:    addl %ebx, %r10d
+; AVX512-NEXT:    addl %r11d, %r10d
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    xorl %r11d, %r11d
+; AVX512-NEXT:    popcntq {{[0-9]+}}(%rsp), %r11
+; AVX512-NEXT:    addl %eax, %r11d
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq %r9, %rax
+; AVX512-NEXT:    popcntq %r8, %r8
+; AVX512-NEXT:    addl %eax, %r8d
+; AVX512-NEXT:    addl %r11d, %r8d
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq %rcx, %rax
+; AVX512-NEXT:    xorl %ecx, %ecx
+; AVX512-NEXT:    popcntq %rdx, %rcx
+; AVX512-NEXT:    addl %eax, %ecx
+; AVX512-NEXT:    xorl %edx, %edx
+; AVX512-NEXT:    popcntq %rsi, %rdx
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq %rdi, %rax
+; AVX512-NEXT:    addl %edx, %eax
+; AVX512-NEXT:    addl %ecx, %eax
+; AVX512-NEXT:    addl %r8d, %eax
+; AVX512-NEXT:    addl %r10d, %eax
+; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    retq
+  %cnt = call i1024 @llvm.ctpop.i1024(i1024 %a0)
+  %res = trunc i1024 %cnt to i32
+  ret i32 %res
+}
+
+define i32 @load_ctpop_i1024(ptr %p0) nounwind {
+; SSE-LABEL: load_ctpop_i1024:
+; SSE:       # %bb.0:
+; SSE-NEXT:    popcntq 120(%rdi), %rax
+; SSE-NEXT:    popcntq 112(%rdi), %rcx
+; SSE-NEXT:    popcntq 104(%rdi), %rdx
+; SSE-NEXT:    popcntq 96(%rdi), %rsi
+; SSE-NEXT:    addl %eax, %ecx
+; SSE-NEXT:    addl %edx, %esi
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq 88(%rdi), %rax
+; SSE-NEXT:    addl %ecx, %esi
+; SSE-NEXT:    xorl %edx, %edx
+; SSE-NEXT:    popcntq 80(%rdi), %rdx
+; SSE-NEXT:    addl %eax, %edx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq 72(%rdi), %rax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    popcntq 64(%rdi), %rcx
+; SSE-NEXT:    addl %eax, %ecx
+; SSE-NEXT:    addl %edx, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq 56(%rdi), %rax
+; SSE-NEXT:    addl %esi, %ecx
+; SSE-NEXT:    xorl %edx, %edx
+; SSE-NEXT:    popcntq 48(%rdi), %rdx
+; SSE-NEXT:    addl %eax, %edx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq 40(%rdi), %rax
+; SSE-NEXT:    xorl %esi, %esi
+; SSE-NEXT:    popcntq 32(%rdi), %rsi
+; SSE-NEXT:    addl %eax, %esi
+; SSE-NEXT:    addl %edx, %esi
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq 24(%rdi), %rax
+; SSE-NEXT:    xorl %edx, %edx
+; SSE-NEXT:    popcntq 16(%rdi), %rdx
+; SSE-NEXT:    popcntq 8(%rdi), %r8
+; SSE-NEXT:    addl %eax, %edx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq (%rdi), %rax
+; SSE-NEXT:    addl %r8d, %eax
+; SSE-NEXT:    addl %edx, %eax
+; SSE-NEXT:    addl %esi, %eax
+; SSE-NEXT:    addl %ecx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: load_ctpop_i1024:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    popcntq 120(%rdi), %rax
+; AVX2-NEXT:    popcntq 112(%rdi), %rcx
+; AVX2-NEXT:    addl %eax, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq 104(%rdi), %rax
+; AVX2-NEXT:    popcntq 96(%rdi), %rdx
+; AVX2-NEXT:    addl %eax, %edx
+; AVX2-NEXT:    addl %ecx, %edx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq 88(%rdi), %rax
+; AVX2-NEXT:    popcntq 80(%rdi), %rsi
+; AVX2-NEXT:    popcntq 72(%rdi), %r8
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    popcntq 64(%rdi), %rcx
+; AVX2-NEXT:    addl %eax, %esi
+; AVX2-NEXT:    addl %r8d, %ecx
+; AVX2-NEXT:    addl %esi, %ecx
+; AVX2-NEXT:    addl %edx, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq 56(%rdi), %rax
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    popcntq 48(%rdi), %rdx
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    popcntq 40(%rdi), %rsi
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    popcntq 32(%rdi), %r8
+; AVX2-NEXT:    addl %eax, %edx
+; AVX2-NEXT:    addl %esi, %r8d
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq 24(%rdi), %rax
+; AVX2-NEXT:    addl %edx, %r8d
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    popcntq 16(%rdi), %rdx
+; AVX2-NEXT:    addl %eax, %edx
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    popcntq 8(%rdi), %rsi
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq (%rdi), %rax
+; AVX2-NEXT:    addl %esi, %eax
+; AVX2-NEXT:    addl %edx, %eax
+; AVX2-NEXT:    addl %r8d, %eax
+; AVX2-NEXT:    addl %ecx, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_ctpop_i1024:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    popcntq 120(%rdi), %rax
+; AVX512-NEXT:    popcntq 112(%rdi), %rcx
+; AVX512-NEXT:    addl %eax, %ecx
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq 104(%rdi), %rax
+; AVX512-NEXT:    popcntq 96(%rdi), %rdx
+; AVX512-NEXT:    addl %eax, %edx
+; AVX512-NEXT:    addl %ecx, %edx
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq 88(%rdi), %rax
+; AVX512-NEXT:    popcntq 80(%rdi), %rsi
+; AVX512-NEXT:    popcntq 72(%rdi), %r8
+; AVX512-NEXT:    addl %eax, %esi
+; AVX512-NEXT:    xorl %ecx, %ecx
+; AVX512-NEXT:    popcntq 64(%rdi), %rcx
+; AVX512-NEXT:    addl %r8d, %ecx
+; AVX512-NEXT:    addl %esi, %ecx
+; AVX512-NEXT:    addl %edx, %ecx
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq 56(%rdi), %rax
+; AVX512-NEXT:    xorl %edx, %edx
+; AVX512-NEXT:    popcntq 48(%rdi), %rdx
+; AVX512-NEXT:    xorl %esi, %esi
+; AVX512-NEXT:    popcntq 40(%rdi), %rsi
+; AVX512-NEXT:    addl %eax, %edx
+; AVX512-NEXT:    xorl %r8d, %r8d
+; AVX512-NEXT:    popcntq 32(%rdi), %r8
+; AVX512-NEXT:    addl %esi, %r8d
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq 24(%rdi), %rax
+; AVX512-NEXT:    addl %edx, %r8d
+; AVX512-NEXT:    xorl %edx, %edx
+; AVX512-NEXT:    popcntq 16(%rdi), %rdx
+; AVX512-NEXT:    addl %eax, %edx
+; AVX512-NEXT:    xorl %esi, %esi
+; AVX512-NEXT:    popcntq 8(%rdi), %rsi
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    popcntq (%rdi), %rax
+; AVX512-NEXT:    addl %esi, %eax
+; AVX512-NEXT:    addl %edx, %eax
+; AVX512-NEXT:    addl %r8d, %eax
+; AVX512-NEXT:    addl %ecx, %eax
+; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    retq
+  %a0 = load i1024, ptr %p0
+  %cnt = call i1024 @llvm.ctpop.i1024(i1024 %a0)
+  %res = trunc i1024 %cnt to i32
+  ret i32 %res
+}
+
+;
+; CTLZ
+;
+
+define i32 @test_ctlz_i128(i128 %a0) nounwind {
+; SSE-LABEL: test_ctlz_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    bsrq %rsi, %rcx
+; SSE-NEXT:    xorl $63, %ecx
+; SSE-NEXT:    movl $127, %eax
+; SSE-NEXT:    bsrq %rdi, %rax
+; SSE-NEXT:    xorl $63, %eax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %ecx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: test_ctlz_i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    lzcntq %rsi, %rcx
+; AVX2-NEXT:    lzcntq %rdi, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    cmovnel %ecx, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ctlz_i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    lzcntq %rsi, %rcx
+; AVX512-NEXT:    lzcntq %rdi, %rax
+; AVX512-NEXT:    addl $64, %eax
+; AVX512-NEXT:    testq %rsi, %rsi
+; AVX512-NEXT:    cmovnel %ecx, %eax
+; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    retq
+  %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 0)
+  %res = trunc i128 %cnt to i32
+  ret i32 %res
+}
+
+define i32 @load_ctlz_i128(ptr %p0) nounwind {
+; SSE-LABEL: load_ctlz_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq 8(%rdi), %rcx
+; SSE-NEXT:    bsrq %rcx, %rdx
+; SSE-NEXT:    xorl $63, %edx
+; SSE-NEXT:    movl $127, %eax
+; SSE-NEXT:    bsrq (%rdi), %rax
+; SSE-NEXT:    xorl $63, %eax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %edx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: load_ctlz_i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq 8(%rdi), %rcx
+; AVX2-NEXT:    lzcntq %rcx, %rdx
+; AVX2-NEXT:    lzcntq (%rdi), %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %edx, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_ctlz_i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq 8(%rdi), %rcx
+; AVX512-NEXT:    lzcntq %rcx, %rdx
+; AVX512-NEXT:    lzcntq (%rdi), %rax
+; AVX512-NEXT:    addl $64, %eax
+; AVX512-NEXT:    testq %rcx, %rcx
...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/164450


More information about the llvm-commits mailing list