[llvm-bugs] [Bug 40095] New: STL iterator std::copy/for loop produces inefficient code
via llvm-bugs
llvm-bugs at lists.llvm.org
Tue Dec 18 19:59:52 PST 2018
https://bugs.llvm.org/show_bug.cgi?id=40095
Bug ID: 40095
Summary: STL iterator std::copy/for loop produces inefficient code
Product: libraries
Version: trunk
Hardware: PC
OS: Windows NT
Status: NEW
Severity: enhancement
Priority: P
Component: Loop Optimizer
Assignee: unassignedbugs at nondot.org
Reporter: husseydevin at gmail.com
CC: llvm-bugs at lists.llvm.org
This code was tested with libstdc++ on Godbolt.
From looking at Godbolt versions, I noticed that clang 6.0 will convert this code:
unsigned copy_equal(const std::vector<unsigned char> &__restrict vec)
{
    unsigned ret;
    unsigned char *__restrict result = reinterpret_cast<unsigned char*>(&ret);
    auto last = vec.cbegin() + sizeof(unsigned);
    auto first = vec.cbegin();
    while (first != last) {
        *result = *first;
        ++first;
        ++result;
    }
    return ret;
}
as expected to this (clang -O3 -m32):
copy_equal(std::vector<unsigned char, std::allocator<unsigned char> > const&): # @copy_equal(std::vector<unsigned char, std::allocator<unsigned char> > const&)
mov eax, dword ptr [esp + 4]
mov eax, dword ptr [eax]
mov eax, dword ptr [eax]
ret
GCC has done this same optimization since 4.9, although it aligns the stack
first.
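For reference, the loop above is effectively a fixed-size memcpy, and a plain memcpy of sizeof(unsigned) bytes is something compilers typically lower to a single 32-bit load; a minimal sketch of that baseline (the function name copy_memcpy is mine, not part of the original snippets):
#include <cstring>
#include <vector>
// Reference version of the same copy written with memcpy; this is the codegen
// baseline the loop versions should also reach.
unsigned copy_memcpy(const std::vector<unsigned char> &vec)
{
    unsigned ret;
    std::memcpy(&ret, vec.data(), sizeof(ret));
    return ret;
}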
However, if we instead write the copy as a counted for loop over the iterator distance:
unsigned copy_reverse(const std::vector<unsigned char> &__restrict vec)
{
    unsigned ret;
    unsigned char *__restrict result = reinterpret_cast<unsigned char*>(&ret);
    auto last = vec.cbegin() + sizeof(unsigned);
    auto first = vec.cbegin();
    for (auto n = last - first; n > 0; --n) {
        *result = *first;
        ++first;
        ++result;
    }
    return ret;
}
we get this:
copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&): # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&)
xor eax, eax
test al, al
jne .LBB0_1
push ebx
push esi
push eax
mov eax, dword ptr [esp + 16]
mov edx, 4
mov ecx, esp
cmp edx, 32
mov eax, dword ptr [eax]
jb .LBB0_6
xor esi, esi
.LBB0_4: # =>This Inner Loop Header: Depth=1
movups xmm0, xmmword ptr [eax + esi]
movups xmm1, xmmword ptr [eax + esi + 16]
movups xmmword ptr [esp + esi], xmm0
movups xmmword ptr [esp + esi + 16], xmm1
add esi, 32
jne .LBB0_4
test edx, edx
je .LBB0_8
.LBB0_6:
lea edx, [eax + 4]
xor esi, esi
sub edx, ecx
sub edx, eax
lea edx, [esp + edx]
.LBB0_7: # =>This Inner Loop Header: Depth=1
movzx ebx, byte ptr [eax + esi]
mov byte ptr [ecx + esi], bl
inc esi
cmp edx, esi
jne .LBB0_7
.LBB0_8:
mov eax, dword ptr [esp]
add esp, 4
pop esi
pop ebx
ret
.LBB0_1:
ret
or, with -mno-sse,
copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&): # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&)
xor eax, eax
test al, al
jne .LBB0_1
push ebx
push eax
mov ecx, dword ptr [esp + 12]
mov edx, 4
mov ecx, dword ptr [ecx]
.LBB0_3: # =>This Inner Loop Header: Depth=1
movzx ebx, byte ptr [ecx + eax]
mov byte ptr [esp + eax], bl
inc eax
cmp edx, eax
jne .LBB0_3
mov eax, dword ptr [esp]
add esp, 4
pop ebx
ret
While the first form is the recommended way to write this (it is even the example suggested on cppreference.com), the second form is important because libstdc++ uses it to implement std::copy for random access iterators, which covers the STL containers:
template <>
struct __copy_move<false, false, random_access_iterator_tag> {
    template <typename _II, typename _OI>
    static _OI __copy_m(_II __first, _II __last, _OI __result) {
        typedef typename iterator_traits<_II>::difference_type _Distance;
        for (_Distance __n = __last - __first; __n > 0; --__n) {
            *__result = *__first;
            ++__first;
            ++__result;
        }
        return __result;
    }
};
As a result, std::copy on libstdc++ is inefficient here.
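For completeness, a std::copy-based reproducer that goes through the __copy_m specialization above should show the same poor codegen; a minimal sketch (the function name copy_std is mine, not part of the original snippets):
#include <algorithm>
#include <vector>
// Hypothetical reproducer: with libstdc++, std::copy over these iterators
// dispatches to the __copy_m loop shown above.
unsigned copy_std(const std::vector<unsigned char> &__restrict vec)
{
    unsigned ret;
    std::copy(vec.cbegin(), vec.cbegin() + sizeof(unsigned),
              reinterpret_cast<unsigned char *>(&ret));
    return ret;
}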
This affects all targets, although x86_64 seems to generate slightly better code than the others, and i386 slightly worse.
x86_64 output:
copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&): # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&)
mov rax, qword ptr [rdi]
mov cl, byte ptr [rax]
mov byte ptr [rsp - 4], cl
mov cl, byte ptr [rax + 1]
mov byte ptr [rsp - 3], cl
mov cl, byte ptr [rax + 2]
mov byte ptr [rsp - 2], cl
mov al, byte ptr [rax + 3]
mov byte ptr [rsp - 1], al
mov eax, dword ptr [rsp - 4]
ret
Comparison: https://godbolt.org/z/wwr8Sz