<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - STL iterator std::copy/for loop produces inefficient code"
href="https://bugs.llvm.org/show_bug.cgi?id=40095">40095</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>STL iterator std::copy/for loop produces inefficient code
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Windows NT
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Loop Optimizer
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>husseydevin@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>This code was tested with libstdc++ on Godbolt.
>From looking at Godbolt versions, I noticed that clang 6.0 will convert this
code:
unsigned copy_equal(const std::vector<unsigned char> &__restrict vec)
{
unsigned ret;
unsigned char *__restrict result = reinterpret_cast<unsigned char*>(&ret);
auto last = vec.cbegin() + sizeof(unsigned);
auto first = vec.cbegin();
while (first != last) {
*result = *first;
++first;
++result;
}
return ret;
}
as expected to this (clang -O3 -m32):
copy_equal(std::vector<unsigned char, std::allocator<unsigned char> > const&):
# @copy_equal(std::vector<unsigned char, std::allocator<unsigned char> >
const&)
mov eax, dword ptr [esp + 4]
mov eax, dword ptr [eax]
mov eax, dword ptr [eax]
ret
GCC has done this same optimization since 4.9, although it aligns the stack
first.
h
unsigned copy_reverse(const std::vector<unsigned char> &__restrict vec)
{
unsigned ret;
unsigned char *__restrict result = reinterpret_cast<unsigned char*>(&ret);
auto last = vec.cbegin() + sizeof(unsigned);
auto first = vec.cbegin();
for (auto n = last - first; n > 0; --n) {
*result = *first;
++first;
++result;
}
return ret;
}
we get this:
copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> >
const&): # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned
char> > const&)
xor eax, eax
test al, al
jne .LBB0_1
push ebx
push esi
push eax
mov eax, dword ptr [esp + 16]
mov edx, 4
mov ecx, esp
cmp edx, 32
mov eax, dword ptr [eax]
jb .LBB0_6
xor esi, esi
.LBB0_4: # =>This Inner Loop Header: Depth=1
movups xmm0, xmmword ptr [eax + esi]
movups xmm1, xmmword ptr [eax + esi + 16]
movups xmmword ptr [esp + esi], xmm0
movups xmmword ptr [esp + esi + 16], xmm1
add esi, 32
jne .LBB0_4
test edx, edx
je .LBB0_8
.LBB0_6:
lea edx, [eax + 4]
xor esi, esi
sub edx, ecx
sub edx, eax
lea edx, [esp + edx]
.LBB0_7: # =>This Inner Loop Header: Depth=1
movzx ebx, byte ptr [eax + esi]
mov byte ptr [ecx + esi], bl
inc esi
cmp edx, esi
jne .LBB0_7
.LBB0_8:
mov eax, dword ptr [esp]
add esp, 4
pop esi
pop ebx
ret
.LBB0_1:
ret
or, with -mno-sse,
copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> >
const&): # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned
char> > const&)
xor eax, eax
test al, al
jne .LBB0_1
push ebx
push eax
mov ecx, dword ptr [esp + 12]
mov edx, 4
mov ecx, dword ptr [ecx]
.LBB0_3: # =>This Inner Loop Header: Depth=1
movzx ebx, byte ptr [ecx + eax]
mov byte ptr [esp + eax], bl
inc eax
cmp edx, eax
jne .LBB0_3
mov eax, dword ptr [esp]
add esp, 4
pop ebx
ret
While the recommended way is the first one (even suggested on
cppreference.com), the second one is important because libstdc++ uses that for
their copy iterator on STL containers:
template <>
struct __copy_move<false, false, random_access_iterator_tag> {
template <typename _II, typename _OI>
static _OI __copy_m(_II __first, _II __last, _OI __result) {
typedef typename iterator_traits<_II>::difference_type _Distance;
for (_Distance __n = __last - __first; __n > 0; --__n) {
*__result = *__first;
++__first;
++__result;
}
return __result;
}
};
As a result, std::copy on libstdc++ is inefficient here.
This affects all targets, although x86_64 seems to generate slightly better
code than others, and i386 generating slightly worse code than the others.
x86_64 output:
copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> >
const&): # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned
char> > const&)
mov rax, qword ptr [rdi]
mov cl, byte ptr [rax]
mov byte ptr [rsp - 4], cl
mov cl, byte ptr [rax + 1]
mov byte ptr [rsp - 3], cl
mov cl, byte ptr [rax + 2]
mov byte ptr [rsp - 2], cl
mov al, byte ptr [rax + 3]
mov byte ptr [rsp - 1], al
mov eax, dword ptr [rsp - 4]
ret
Comparison: <a href="https://godbolt.org/z/wwr8Sz">https://godbolt.org/z/wwr8Sz</a></pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>