[llvm-bugs] [Bug 52436] New: Missed optimization opportunity: memberwise copy of aggregate

Sun Nov 7 14:26:14 PST 2021

https://bugs.llvm.org/show_bug.cgi?id=52436

            Bug ID: 52436
           Summary: Missed optimization opportunity: memberwise copy of
                    aggregate
           Product: new-bugs
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: new bugs
          Assignee: unassignedbugs at nondot.org
          Reporter: roman.zelenyi at gmail.com
                CC: htmldeveloper at gmail.com, llvm-bugs at lists.llvm.org

Clang generates inefficient code when copying some aggregates member by member.
For example:

//{{
// Reproducer aggregate: four `short` members, each assigned individually by copy() below.
struct S { short m0, m1, m2, m3; };

// Copies `c` elements of S from `ps` to `pd`, assigning the four members one
// at a time instead of assigning whole S objects.
//
// pd: destination array; declared __restrict, i.e. the caller promises it
//     does not alias the memory read through `ps`.
// ps: source array, read-only.
// c:  number of S elements to copy; nothing is copied when c == 0.
//
// NOTE: std::size_t requires <cstddef>; the include is not shown in this snippet.
void copy(S * __restrict pd, S const * ps, std::size_t c) {
    for (std::size_t i{0}; i < c; ++i) {
        // Together these four assignments cover every member of S, so each
        // iteration copies one complete element.
        pd[i].m0 = ps[i].m0;
        pd[i].m1 = ps[i].m1;
        pd[i].m2 = ps[i].m2;
        pd[i].m3 = ps[i].m3;        
    }
}
//}}

For the x64 target, Clang at -O2/-O3 compiles this code to:

copy(S*, S const*, unsigned long):                        # @copy(S*, S const*, unsigned long)
        test    rdx, rdx
        je      .LBB0_5
        cmp     rdx, 1
        jne     .LBB0_6
        xor     eax, eax
        jmp     .LBB0_3
.LBB0_6:
        mov     r8, rdx
        and     r8, -2
        xor     eax, eax
.LBB0_7:                                # =>This Inner Loop Header: Depth=1
        movzx   ecx, word ptr [rsi + 8*rax]
        mov     word ptr [rdi + 8*rax], cx
        movzx   ecx, word ptr [rsi + 8*rax + 2]
        mov     word ptr [rdi + 8*rax + 2], cx
        movzx   ecx, word ptr [rsi + 8*rax + 4]
        mov     word ptr [rdi + 8*rax + 4], cx
        movzx   ecx, word ptr [rsi + 8*rax + 6]
        mov     word ptr [rdi + 8*rax + 6], cx
        movzx   ecx, word ptr [rsi + 8*rax + 8]
        mov     word ptr [rdi + 8*rax + 8], cx
        movzx   ecx, word ptr [rsi + 8*rax + 10]
        mov     word ptr [rdi + 8*rax + 10], cx
        movzx   ecx, word ptr [rsi + 8*rax + 12]
        mov     word ptr [rdi + 8*rax + 12], cx
        movzx   ecx, word ptr [rsi + 8*rax + 14]
        mov     word ptr [rdi + 8*rax + 14], cx
        add     rax, 2
        cmp     r8, rax
        jne     .LBB0_7
.LBB0_3:
        test    dl, 1
        je      .LBB0_5
        movzx   ecx, word ptr [rsi + 8*rax]
        mov     word ptr [rdi + 8*rax], cx
        movzx   ecx, word ptr [rsi + 8*rax + 2]
        mov     word ptr [rdi + 8*rax + 2], cx
        movzx   ecx, word ptr [rsi + 8*rax + 4]
        mov     word ptr [rdi + 8*rax + 4], cx
        movzx   ecx, word ptr [rsi + 8*rax + 6]
        mov     word ptr [rdi + 8*rax + 6], cx
.LBB0_5:
        ret

GCC generates much more efficient code here:

copy(S*, S const*, unsigned long):
        test    rdx, rdx
        je      .L1
        cmp     rdx, 1
        je      .L6
        mov     rcx, rdx
        xor     eax, eax
        shr     rcx
        sal     rcx, 4
.L4:
        movdqu  xmm0, XMMWORD PTR [rsi+rax]
        movups  XMMWORD PTR [rdi+rax], xmm0
        add     rax, 16
        cmp     rax, rcx
        jne     .L4
        test    dl, 1
        je      .L1
        and     rdx, -2
.L3:
        mov     rax, QWORD PTR [rsi+rdx*8]
        mov     QWORD PTR [rdi+rdx*8], rax
.L1:
        ret
.L6:
        xor     edx, edx
        jmp     .L3

This defect can be important for generic programming where explicit
optimization of such cases can be tricky or cumbersome.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20211107/e861750d/attachment.html>