<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - Load folding defeated by VPCMP to VTESTNM optimization"
href="https://bugs.llvm.org/show_bug.cgi?id=43815">43815</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Load folding defeated by VPCMP to VTESTNM optimization
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>All
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Backend: X86
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>dave@znu.io
</td>
</tr>
<tr>
<th>CC</th>
<td>craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, spatel+llvm@rotateright.com
</td>
</tr></table>
<p>
<div>
<pre>While trying to create an AVX512BW version of strlen() using the X86 clang
intrinsics, I noticed that load folding was being defeated by a VPCMP to
VTESTNM optimization. For example:
unsigned long avx512_strlen(const char *_ptr) {
__m512i zero = { 0 };
//asm("" : "+v" (zero));
const char *ptr = reinterpret_cast&lt;const char
*&gt;(reinterpret_cast&lt;uintptr_t&gt;(_ptr) &amp; ~63ul);
uint64_t mask = ~0ul &lt;&lt; (ptr - _ptr);
auto result = _mm512_cmp_epi8_mask(zero, *reinterpret_cast&lt;const
__m512i*&gt;(ptr), 0);
result &amp;= mask;
if (!result) for (;;) {
ptr += 64;
result = _mm512_cmp_epi8_mask(zero, *reinterpret_cast&lt;const
__m512i*&gt;(ptr), 0);
if (result)
break;
}
ptr += __builtin_ctzll(result);
return ptr - _ptr;
}
Generates:
_ZL13avx512_strlenPKc: # @_ZL13avx512_strlenPKc
.cfi_startproc
# %bb.0:
movq %rdi, %rcx
andq $-64, %rcx
movl %ecx, %eax
subl %edi, %eax
vmovdqa64 (%rcx), %zmm0
vptestnmb %zmm0, %zmm0, %k0
kmovq %k0, %rdx
shrxq %rax, %rdx, %rdx
shlxq %rax, %rdx, %rax
testq %rax, %rax
je .LBB0_1
# %bb.3:
tzcntq %rax, %rax
addq %rcx, %rax
subq %rdi, %rax
vzeroupper
retq
.p2align 4, 0x90
.LBB0_1: # =>This Inner Loop Header: Depth=1
vmovdqa64 64(%rcx), %zmm0
addq $64, %rcx
vptestnmb %zmm0, %zmm0, %k0
kortestq %k0, %k0
je .LBB0_1
# %bb.2:
kmovq %k0, %rax
tzcntq %rax, %rax
addq %rcx, %rax
subq %rdi, %rax
vzeroupper
retq
But with the inline asm() uncommented, the desired code is emitted:
_ZL13avx512_strlenPKc: # @_ZL13avx512_strlenPKc
.cfi_startproc
# %bb.0:
vxorps %xmm0, %xmm0, %xmm0
#APP
#NO_APP
movq %rdi, %rcx
andq $-64, %rcx
movl %ecx, %eax
subl %edi, %eax
vpcmpeqb (%rcx), %zmm0, %k0
kmovq %k0, %rdx
shrxq %rax, %rdx, %rdx
shlxq %rax, %rdx, %rax
testq %rax, %rax
je .LBB0_1
# %bb.3:
tzcntq %rax, %rax
addq %rcx, %rax
subq %rdi, %rax
vzeroupper
retq
.p2align 4, 0x90
.LBB0_1: # =>This Inner Loop Header: Depth=1
vpcmpeqb 64(%rcx), %zmm0, %k0
addq $64, %rcx
kortestq %k0, %k0
je .LBB0_1
# %bb.2:
kmovq %k0, %rax
tzcntq %rax, %rax
addq %rcx, %rax
subq %rdi, %rax
vzeroupper
retq</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>