<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - Missed optimization opportunity: memberwise copy of aggregate"
href="https://bugs.llvm.org/show_bug.cgi?id=52436">52436</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Missed optimization opportunity: memberwise copy of aggregate
</td>
</tr>
<tr>
<th>Product</th>
<td>new-bugs
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Windows NT
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>new bugs
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>roman.zelenyi@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>htmldeveloper@gmail.com, llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>Clang generates inefficient code when copying some aggregates member by member.
For example:
//{{
struct S { short m0, m1, m2, m3; };
void copy(S * __restrict pd, S const * ps, std::size_t c) {
for (std::size_t i{0}; i &lt; c; ++i) {
pd[i].m0 = ps[i].m0;
pd[i].m1 = ps[i].m1;
pd[i].m2 = ps[i].m2;
pd[i].m3 = ps[i].m3;
}
}
//}}
For an x64 target, Clang with -O2/-O3 compiles this code to:
copy(S*, S const*, unsigned long): # @copy(S*, S const*, unsigned long)
test rdx, rdx
je .LBB0_5
cmp rdx, 1
jne .LBB0_6
xor eax, eax
jmp .LBB0_3
.LBB0_6:
mov r8, rdx
and r8, -2
xor eax, eax
.LBB0_7: # =>This Inner Loop Header: Depth=1
movzx ecx, word ptr [rsi + 8*rax]
mov word ptr [rdi + 8*rax], cx
movzx ecx, word ptr [rsi + 8*rax + 2]
mov word ptr [rdi + 8*rax + 2], cx
movzx ecx, word ptr [rsi + 8*rax + 4]
mov word ptr [rdi + 8*rax + 4], cx
movzx ecx, word ptr [rsi + 8*rax + 6]
mov word ptr [rdi + 8*rax + 6], cx
movzx ecx, word ptr [rsi + 8*rax + 8]
mov word ptr [rdi + 8*rax + 8], cx
movzx ecx, word ptr [rsi + 8*rax + 10]
mov word ptr [rdi + 8*rax + 10], cx
movzx ecx, word ptr [rsi + 8*rax + 12]
mov word ptr [rdi + 8*rax + 12], cx
movzx ecx, word ptr [rsi + 8*rax + 14]
mov word ptr [rdi + 8*rax + 14], cx
add rax, 2
cmp r8, rax
jne .LBB0_7
.LBB0_3:
test dl, 1
je .LBB0_5
movzx ecx, word ptr [rsi + 8*rax]
mov word ptr [rdi + 8*rax], cx
movzx ecx, word ptr [rsi + 8*rax + 2]
mov word ptr [rdi + 8*rax + 2], cx
movzx ecx, word ptr [rsi + 8*rax + 4]
mov word ptr [rdi + 8*rax + 4], cx
movzx ecx, word ptr [rsi + 8*rax + 6]
mov word ptr [rdi + 8*rax + 6], cx
.LBB0_5:
ret
GCC generates much more efficient code here:
copy(S*, S const*, unsigned long):
test rdx, rdx
je .L1
cmp rdx, 1
je .L6
mov rcx, rdx
xor eax, eax
shr rcx
sal rcx, 4
.L4:
movdqu xmm0, XMMWORD PTR [rsi+rax]
movups XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rcx
jne .L4
test dl, 1
je .L1
and rdx, -2
.L3:
mov rax, QWORD PTR [rsi+rdx*8]
mov QWORD PTR [rdi+rdx*8], rax
.L1:
ret
.L6:
xor edx, edx
jmp .L3
This defect can be important for generic programming where explicit
optimization of such cases can be tricky or cumbersome.</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>