<!doctype html>
<html lang="en">

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Missed optimization opportunity: memberwise copy of aggregate"

   href="https://bugs.llvm.org/show_bug.cgi?id=52436">52436</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Missed optimization opportunity: memberwise copy of aggregate

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>new-bugs

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Windows NT

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>new bugs

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>roman.zelenyi@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>htmldeveloper@gmail.com, llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Clang generates inefficient code while copying some aggregates by member. 

For example:

//{{

struct S { short m0, m1, m2, m3; };

void copy(S * __restrict pd, S const * ps, std::size_t c) {

    for (std::size_t i{0}; i < c; ++i) {

        pd[i].m0 = ps[i].m0;

        pd[i].m1 = ps[i].m1;

        pd[i].m2 = ps[i].m2;

        pd[i].m3 = ps[i].m3;        

    }

}

//}}

For x64 target clang with -O2/-O3 compiles this code to:

copy(S*, S const*, unsigned long):                        # @copy(S*, S const*,

unsigned long)

        test    rdx, rdx

        je      .LBB0_5

        cmp     rdx, 1

        jne     .LBB0_6

        xor     eax, eax

        jmp     .LBB0_3

.LBB0_6:

        mov     r8, rdx

        and     r8, -2

        xor     eax, eax

.LBB0_7:                                # =>This Inner Loop Header: Depth=1

        movzx   ecx, word ptr [rsi + 8*rax]

        mov     word ptr [rdi + 8*rax], cx

        movzx   ecx, word ptr [rsi + 8*rax + 2]

        mov     word ptr [rdi + 8*rax + 2], cx

        movzx   ecx, word ptr [rsi + 8*rax + 4]

        mov     word ptr [rdi + 8*rax + 4], cx

        movzx   ecx, word ptr [rsi + 8*rax + 6]

        mov     word ptr [rdi + 8*rax + 6], cx

        movzx   ecx, word ptr [rsi + 8*rax + 8]

        mov     word ptr [rdi + 8*rax + 8], cx

        movzx   ecx, word ptr [rsi + 8*rax + 10]

        mov     word ptr [rdi + 8*rax + 10], cx

        movzx   ecx, word ptr [rsi + 8*rax + 12]

        mov     word ptr [rdi + 8*rax + 12], cx

        movzx   ecx, word ptr [rsi + 8*rax + 14]

        mov     word ptr [rdi + 8*rax + 14], cx

        add     rax, 2

        cmp     r8, rax

        jne     .LBB0_7

.LBB0_3:

        test    dl, 1

        je      .LBB0_5

        movzx   ecx, word ptr [rsi + 8*rax]

        mov     word ptr [rdi + 8*rax], cx

        movzx   ecx, word ptr [rsi + 8*rax + 2]

        mov     word ptr [rdi + 8*rax + 2], cx

        movzx   ecx, word ptr [rsi + 8*rax + 4]

        mov     word ptr [rdi + 8*rax + 4], cx

        movzx   ecx, word ptr [rsi + 8*rax + 6]

        mov     word ptr [rdi + 8*rax + 6], cx

.LBB0_5:

        ret

GCC generates much more efficient code here:

copy(S*, S const*, unsigned long):

        test    rdx, rdx

        je      .L1

        cmp     rdx, 1

        je      .L6

        mov     rcx, rdx

        xor     eax, eax

        shr     rcx

        sal     rcx, 4

.L4:

        movdqu  xmm0, XMMWORD PTR [rsi+rax]

        movups  XMMWORD PTR [rdi+rax], xmm0

        add     rax, 16

        cmp     rax, rcx

        jne     .L4

        test    dl, 1

        je      .L1

        and     rdx, -2

.L3:

        mov     rax, QWORD PTR [rsi+rdx*8]

        mov     QWORD PTR [rdi+rdx*8], rax

.L1:

        ret

.L6:

        xor     edx, edx

        jmp     .L3

This defect can be important for generic programming where explicit

optimization of such cases can be tricky or cumbersome.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>