[llvm-bugs] [Bug 51786] New: Failure to vectorize max element loop when it iterates over a long range
via llvm-bugs
llvm-bugs at lists.llvm.org
Tue Sep 7 13:42:31 PDT 2021
https://bugs.llvm.org/show_bug.cgi?id=51786
Bug ID: 51786
Summary: Failure to vectorize max element loop when it iterates
over a long range
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Loop Optimizer
Assignee: unassignedbugs at nondot.org
Reporter: gabravier at gmail.com
CC: llvm-bugs at lists.llvm.org
// Reproducer: returns the largest value stored in a fixed-size array of 76 ints.
// The loop keeps a pointer to the largest element seen so far (not its value)
// and dereferences that pointer once at the end.
int test(int (&arr)[76])
{
int *first = &arr[0];
int *last = &arr[76]; // one-past-the-end address; only compared against, never dereferenced
int *largest = first++; // element 0 is the initial candidate; scanning starts at element 1
for (; first != last; ++first)
if (*largest < *first) // strict '<': on equal values the earlier element stays selected
largest = first;
return *largest;
}
With -O3 -mavx2, GCC optimizes the function to this:
# Annotated listing: rdi is used as the base address of arr, and each int is
# 4 bytes, so byte offset 4*k addresses arr[k]. vpmaxsd computes a lane-wise
# signed 32-bit maximum; a ymm register holds 8 ints, an xmm register holds 4.
test(int (&) [76]):
vmovdqu ymm2, YMMWORD PTR [rdi+36] # ymm2 = arr[9..16]
vpbroadcastd ymm0, DWORD PTR [rdi] # ymm0 = arr[0] copied into all 8 lanes
vpmaxsd ymm1, ymm2, YMMWORD PTR [rdi+68] # ymm1 = max(arr[9..16], arr[17..24])
vpmaxsd ymm0, ymm0, YMMWORD PTR [rdi+4] # ymm0 = max(arr[0], arr[1..8])
vmovdqu ymm3, YMMWORD PTR [rdi+100] # ymm3 = arr[25..32]
vmovdqu ymm4, YMMWORD PTR [rdi+196] # ymm4 = arr[49..56]
vpmaxsd ymm1, ymm1, ymm0 # ymm1 now covers arr[0..24]
vpmaxsd ymm0, ymm3, YMMWORD PTR [rdi+132] # max(arr[25..32], arr[33..40])
vpmaxsd ymm0, ymm0, YMMWORD PTR [rdi+164] # ... and arr[41..48]
vpmaxsd ymm1, ymm1, ymm0 # ymm1 now covers arr[0..48]
vpmaxsd ymm0, ymm4, YMMWORD PTR [rdi+228] # max(arr[49..56], arr[57..64])
vpmaxsd ymm0, ymm0, YMMWORD PTR [rdi+260] # ... and arr[65..72]
vpmaxsd ymm1, ymm1, ymm0 # ymm1 now covers arr[0..72]
vextracti128 xmm0, ymm1, 0x1 # horizontal reduction: take the upper 4 lanes
vpmaxsd xmm0, xmm0, xmm1 # 8 lanes -> 4 lanes
vpsrldq xmm1, xmm0, 8 # shift right by 8 bytes: lanes 2..3 move to lanes 0..1
vpmaxsd xmm0, xmm0, xmm1 # 4 lanes -> 2 lanes
vpsrldq xmm1, xmm0, 4 # shift right by 4 bytes: lane 1 moves to lane 0
vpmaxsd xmm0, xmm0, xmm1 # lane 0 = max of arr[0..72]
vmovd xmm1, DWORD PTR [rdi+292] # scalar tail: xmm1 lane 0 = arr[73]
vinsertps xmm0, xmm0, xmm0, 0xe # zero mask 0b1110: keep lane 0, clear lanes 1..3
vpmaxsd xmm0, xmm0, xmm1 # fold in arr[73]
vmovd xmm1, DWORD PTR [rdi+296] # arr[74]
vpmaxsd xmm0, xmm0, xmm1 # fold in arr[74]
vmovd xmm1, DWORD PTR [rdi+300] # arr[75], the last element
vpmaxsd xmm0, xmm0, xmm1 # lane 0 = max of all 76 elements
vmovd eax, xmm0 # return value in eax
vzeroupper
ret
LLVM instead outputs this:
# Annotated listing: a scalar compare-and-branch loop with no vector
# instructions. Register roles, in terms of the C++ source:
#   rdi = largest (pointer carried across iterations)
#   rcx = first, rax = last
#   rdx = candidate pointer for largest inside one pass of the loop body
# The body handles 5 consecutive elements per pass (rcx advances by 20 bytes).
test(int (&) [76]): # @test(int (&) [76])
lea rax, [rdi + 304] # rax = last = &arr[76] (76 * 4 bytes)
mov rcx, rdi
add rcx, 4 # rcx = first = &arr[1]; rdi still points at arr[0]
jmp .LBB0_1
.LBB0_11: # in Loop: Header=BB0_1 Depth=1
mov rdi, rdx # commit the candidate in rdx as largest
add rcx, 20 # advance first by 5 ints
cmp rcx, rax
je .LBB0_13 # first == last: leave the loop
.LBB0_1: # =>This Inner Loop Header: Depth=1
mov esi, dword ptr [rdi] # esi = *largest
mov rdx, rcx # tentatively make first[0] the candidate
cmp esi, dword ptr [rcx]
jge .LBB0_2 # *largest >= first[0]: keep the old largest
mov esi, dword ptr [rdx] # candidate is first[0]
cmp esi, dword ptr [rcx + 4]
jl .LBB0_4 # candidate < first[1]: first[1] takes over
.LBB0_5: # in Loop: Header=BB0_1 Depth=1
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 8]
jl .LBB0_6 # candidate < first[2]: first[2] takes over
.LBB0_7: # in Loop: Header=BB0_1 Depth=1
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 12]
jl .LBB0_8 # candidate < first[3]: first[3] takes over
.LBB0_9: # in Loop: Header=BB0_1 Depth=1
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 16]
jge .LBB0_11 # candidate >= first[4]: keep the candidate
jmp .LBB0_10 # otherwise first[4] becomes largest
.LBB0_2: # in Loop: Header=BB0_1 Depth=1
mov rdx, rdi # candidate stays at the incoming largest
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 4]
jge .LBB0_5 # candidate >= first[1]: keep it
.LBB0_4: # in Loop: Header=BB0_1 Depth=1
lea rdx, [rcx + 4] # candidate = &first[1]
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 8]
jge .LBB0_7 # candidate >= first[2]: keep it
.LBB0_6: # in Loop: Header=BB0_1 Depth=1
lea rdx, [rcx + 8] # candidate = &first[2]
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 12]
jge .LBB0_9 # candidate >= first[3]: keep it
.LBB0_8: # in Loop: Header=BB0_1 Depth=1
lea rdx, [rcx + 12] # candidate = &first[3]
mov esi, dword ptr [rdx]
cmp esi, dword ptr [rcx + 16]
jge .LBB0_11 # candidate >= first[4]: keep it
.LBB0_10: # in Loop: Header=BB0_1 Depth=1
lea rdi, [rcx + 16] # largest = &first[4]
add rcx, 20 # advance first by 5 ints
cmp rcx, rax
jne .LBB0_1 # more elements remain: next pass
.LBB0_13:
mov eax, dword ptr [rdi] # return *largest
ret
For any array size smaller than 76, LLVM is able to perform this optimization, but as
soon as the size reaches 76, the loop is no longer vectorized.
Godbolt link: https://godbolt.org/z/zbKa8fjhj
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210907/ee22aceb/attachment.html>
More information about the llvm-bugs
mailing list