[llvm-bugs] [Bug 52436] New: Missed optimization opportunity: memberwise copy of aggregate
via llvm-bugs
llvm-bugs at lists.llvm.org
Sun Nov 7 14:26:14 PST 2021
https://bugs.llvm.org/show_bug.cgi?id=52436
Bug ID: 52436
Summary: Missed optimization opportunity: memberwise copy of aggregate
Product: new-bugs
Version: trunk
Hardware: PC
OS: Windows NT
Status: NEW
Severity: enhancement
Priority: P
Component: new bugs
Assignee: unassignedbugs at nondot.org
Reporter: roman.zelenyi at gmail.com
CC: htmldeveloper at gmail.com, llvm-bugs at lists.llvm.org
Clang generates inefficient code when copying some aggregates member by member.
For example:
//{{
struct S { short m0, m1, m2, m3; };
void copy(S * __restrict pd, S const * ps, std::size_t c) {
for (std::size_t i{0}; i < c; ++i) {
pd[i].m0 = ps[i].m0;
pd[i].m1 = ps[i].m1;
pd[i].m2 = ps[i].m2;
pd[i].m3 = ps[i].m3;
}
}
//}}
For the x64 target, clang with -O2/-O3 compiles this code to:
copy(S*, S const*, unsigned long): # @copy(S*, S const*, unsigned long)
test rdx, rdx
je .LBB0_5
cmp rdx, 1
jne .LBB0_6
xor eax, eax
jmp .LBB0_3
.LBB0_6:
mov r8, rdx
and r8, -2
xor eax, eax
.LBB0_7: # =>This Inner Loop Header: Depth=1
movzx ecx, word ptr [rsi + 8*rax]
mov word ptr [rdi + 8*rax], cx
movzx ecx, word ptr [rsi + 8*rax + 2]
mov word ptr [rdi + 8*rax + 2], cx
movzx ecx, word ptr [rsi + 8*rax + 4]
mov word ptr [rdi + 8*rax + 4], cx
movzx ecx, word ptr [rsi + 8*rax + 6]
mov word ptr [rdi + 8*rax + 6], cx
movzx ecx, word ptr [rsi + 8*rax + 8]
mov word ptr [rdi + 8*rax + 8], cx
movzx ecx, word ptr [rsi + 8*rax + 10]
mov word ptr [rdi + 8*rax + 10], cx
movzx ecx, word ptr [rsi + 8*rax + 12]
mov word ptr [rdi + 8*rax + 12], cx
movzx ecx, word ptr [rsi + 8*rax + 14]
mov word ptr [rdi + 8*rax + 14], cx
add rax, 2
cmp r8, rax
jne .LBB0_7
.LBB0_3:
test dl, 1
je .LBB0_5
movzx ecx, word ptr [rsi + 8*rax]
mov word ptr [rdi + 8*rax], cx
movzx ecx, word ptr [rsi + 8*rax + 2]
mov word ptr [rdi + 8*rax + 2], cx
movzx ecx, word ptr [rsi + 8*rax + 4]
mov word ptr [rdi + 8*rax + 4], cx
movzx ecx, word ptr [rsi + 8*rax + 6]
mov word ptr [rdi + 8*rax + 6], cx
.LBB0_5:
ret
GCC generates much more efficient code here:
copy(S*, S const*, unsigned long):
test rdx, rdx
je .L1
cmp rdx, 1
je .L6
mov rcx, rdx
xor eax, eax
shr rcx
sal rcx, 4
.L4:
movdqu xmm0, XMMWORD PTR [rsi+rax]
movups XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rcx
jne .L4
test dl, 1
je .L1
and rdx, -2
.L3:
mov rax, QWORD PTR [rsi+rdx*8]
mov QWORD PTR [rdi+rdx*8], rax
.L1:
ret
.L6:
xor edx, edx
jmp .L3
This defect can be important in generic programming, where explicitly
optimizing such cases can be tricky or cumbersome.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20211107/e861750d/attachment.html>
More information about the llvm-bugs
mailing list