<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<!-- Notification mail for LLVM bug 51786. The base URL makes any relative
     link in the body resolve against the bug tracker. -->
<title>Bug 51786 - Failure to vectorize max element loop when it iterates over a long range</title>
<base href="https://bugs.llvm.org/">
</head>
<body>
<!-- Bug metadata: one row per field, with the field name in the row header
     cell. The presentational table attributes are kept because this document
     carries no stylesheet that could replace them. -->
<table border="1" cellspacing="0" cellpadding="8">
  <tr>
    <th scope="row">Bug ID</th>
    <td><a class="bz_bug_link bz_status_NEW" href="https://bugs.llvm.org/show_bug.cgi?id=51786" title="NEW - Failure to vectorize max element loop when it iterates over a long range">51786</a></td>
  </tr>
  <tr>
    <th scope="row">Summary</th>
    <td>Failure to vectorize max element loop when it iterates over a long range</td>
  </tr>
  <tr>
    <th scope="row">Product</th>
    <td>libraries</td>
  </tr>
  <tr>
    <th scope="row">Version</th>
    <td>trunk</td>
  </tr>
  <tr>
    <th scope="row">Hardware</th>
    <td>PC</td>
  </tr>
  <tr>
    <th scope="row">OS</th>
    <td>Linux</td>
  </tr>
  <tr>
    <th scope="row">Status</th>
    <td>NEW</td>
  </tr>
  <tr>
    <th scope="row">Severity</th>
    <td>enhancement</td>
  </tr>
  <tr>
    <th scope="row">Priority</th>
    <td>P</td>
  </tr>
  <tr>
    <th scope="row">Component</th>
    <td>Loop Optimizer</td>
  </tr>
  <tr>
    <th scope="row">Assignee</th>
    <td>unassignedbugs@nondot.org</td>
  </tr>
  <tr>
    <th scope="row">Reporter</th>
    <td>gabravier@gmail.com</td>
  </tr>
  <tr>
    <th scope="row">CC</th>
    <td>llvm-bugs@lists.llvm.org</td>
  </tr>
</table>
<!-- Bug description exactly as reported. Whitespace inside the pre element is
     significant, so its lines are not indented. The characters &, < and > in
     the C++ and assembly text are written as character references. The former
     p wrapper was dropped: a div is not allowed inside a p. -->
<div>
<pre>int test(int (&amp;arr)[76])
{
int *first = &amp;arr[0];
int *last = &amp;arr[76];
int *largest = first++;
for (; first != last; ++first)
if (*largest &lt; *first)
largest = first;
return *largest;
}
With -O3 -mavx2, GCC optimizes the function to this:
test(int (&amp;) [76]):
vmovdqu ymm2, YMMWORD PTR [rdi+36]
vpbroadcastd ymm0, DWORD PTR [rdi]
vpmaxsd ymm1, ymm2, YMMWORD PTR [rdi+68]
vpmaxsd ymm0, ymm0, YMMWORD PTR [rdi+4]
vmovdqu ymm3, YMMWORD PTR [rdi+100]
vmovdqu ymm4, YMMWORD PTR [rdi+196]
vpmaxsd ymm1, ymm1, ymm0
vpmaxsd ymm0, ymm3, YMMWORD PTR [rdi+132]
vpmaxsd ymm0, ymm0, YMMWORD PTR [rdi+164]
vpmaxsd ymm1, ymm1, ymm0
vpmaxsd ymm0, ymm4, YMMWORD PTR [rdi+228]
vpmaxsd ymm0, ymm0, YMMWORD PTR [rdi+260]
vpmaxsd ymm1, ymm1, ymm0
vextracti128 xmm0, ymm1, 0x1
vpmaxsd xmm0, xmm0, xmm1
vpsrldq xmm1, xmm0, 8
vpmaxsd xmm0, xmm0, xmm1
vpsrldq xmm1, xmm0, 4
vpmaxsd xmm0, xmm0, xmm1
vmovd xmm1, DWORD PTR [rdi+292]
vinsertps xmm0, xmm0, xmm0, 0xe
vpmaxsd xmm0, xmm0, xmm1
vmovd xmm1, DWORD PTR [rdi+296]
vpmaxsd xmm0, xmm0, xmm1
vmovd xmm1, DWORD PTR [rdi+300]
vpmaxsd xmm0, xmm0, xmm1
vmovd eax, xmm0
vzeroupper
ret
LLVM instead outputs this:
test(int (&amp;) [76]): # @test(int (&amp;) [76])
lea rax, [rdi + 304]
mov rcx, rdi
add rcx, 4
jmp .LBB0_1
.LBB0_11: # in Loop: Header=BB0_1 Depth=1
mov rdi, rdx
add rcx, 20
cmp rcx, rax
je .LBB0_13
.LBB0_1: # =&gt;This Inner Loop Header: Depth=1
mov esi, dword ptr [rdi]
mov rdx, rcx
cmp esi, dword ptr [rcx]
jge .LBB0_2
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 4]
jl .LBB0_4
.LBB0_5: # in Loop: Header=BB0_1 Depth=1
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 8]
jl .LBB0_6
.LBB0_7: # in Loop: Header=BB0_1 Depth=1
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 12]
jl .LBB0_8
.LBB0_9: # in Loop: Header=BB0_1 Depth=1
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 16]
jge .LBB0_11
jmp .LBB0_10
.LBB0_2: # in Loop: Header=BB0_1 Depth=1
mov rdx, rdi
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 4]
jge .LBB0_5
.LBB0_4: # in Loop: Header=BB0_1 Depth=1
lea rdx, [rcx + 4]
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 8]
jge .LBB0_7
.LBB0_6: # in Loop: Header=BB0_1 Depth=1
lea rdx, [rcx + 8]
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 12]
jge .LBB0_9
.LBB0_8: # in Loop: Header=BB0_1 Depth=1
lea rdx, [rcx + 12]
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 16]
jge .LBB0_11
.LBB0_10: # in Loop: Header=BB0_1 Depth=1
lea rdi, [rcx + 16]
add rcx, 20
cmp rcx, rax
jne .LBB0_1
.LBB0_13:
mov eax, dword ptr [rdi]
ret
For anything smaller, LLVM seems capable of doing this optimization, but as
soon as it gets to 76, it isn't vectorized anymore
Godbolt link: <a href="https://godbolt.org/z/zbKa8fjhj">https://godbolt.org/z/zbKa8fjhj</a></pre>
</div>
<hr>
<!-- Mail footer: lists the reason this notification was sent to the recipient. -->
<span>You are receiving this mail because:</span>
<ul>
  <li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>