<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - STL iterator std::copy/for loop produces inefficient code"

   href="https://bugs.llvm.org/show_bug.cgi?id=40095">40095</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>STL iterator std::copy/for loop produces inefficient code

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Windows NT

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Loop Optimizer

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>husseydevin@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>This code was tested with libstdc++ on Godbolt.

>From looking at Godbolt versions, I noticed that clang 6.0 will convert this

code:

unsigned copy_equal(const std::vector<unsigned char> &__restrict vec)

{

    unsigned ret;

    unsigned char *__restrict result = reinterpret_cast<unsigned char*>(&ret);

    auto last = vec.cbegin() + sizeof(unsigned);

    auto first = vec.cbegin();

    while (first != last) {

        *result = *first;

        ++first;

        ++result;

    }

    return ret;

}

as expected to this (clang -O3 -m32):

copy_equal(std::vector<unsigned char, std::allocator<unsigned char> > const&): 

    # @copy_equal(std::vector<unsigned char, std::allocator<unsigned char> >

const&)

        mov     eax, dword ptr [esp + 4]

        mov     eax, dword ptr [eax]

        mov     eax, dword ptr [eax]

        ret

GCC has done this same optimization since 4.9, although it aligns the stack

first.

h

unsigned copy_reverse(const std::vector<unsigned char> &__restrict vec)

{

    unsigned ret;

    unsigned char *__restrict result = reinterpret_cast<unsigned char*>(&ret);

    auto last = vec.cbegin() + sizeof(unsigned);

    auto first = vec.cbegin();

    for (auto n = last - first; n > 0; --n) {

        *result = *first;

        ++first;

        ++result;

    }

    return ret;

}

we get this:

copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> >

const&):    # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned

char> > const&)

        xor     eax, eax

        test    al, al

        jne     .LBB0_1

        push    ebx

        push    esi

        push    eax

        mov     eax, dword ptr [esp + 16]

        mov     edx, 4

        mov     ecx, esp

        cmp     edx, 32

        mov     eax, dword ptr [eax]

        jb      .LBB0_6

        xor     esi, esi

.LBB0_4:                                # =>This Inner Loop Header: Depth=1

        movups  xmm0, xmmword ptr [eax + esi]

        movups  xmm1, xmmword ptr [eax + esi + 16]

        movups  xmmword ptr [esp + esi], xmm0

        movups  xmmword ptr [esp + esi + 16], xmm1

        add     esi, 32

        jne     .LBB0_4

        test    edx, edx

        je      .LBB0_8

.LBB0_6:

        lea     edx, [eax + 4]

        xor     esi, esi

        sub     edx, ecx

        sub     edx, eax

        lea     edx, [esp + edx]

.LBB0_7:                                # =>This Inner Loop Header: Depth=1

        movzx   ebx, byte ptr [eax + esi]

        mov     byte ptr [ecx + esi], bl

        inc     esi

        cmp     edx, esi

        jne     .LBB0_7

.LBB0_8:

        mov     eax, dword ptr [esp]

        add     esp, 4

        pop     esi

        pop     ebx

        ret

.LBB0_1:

        ret

or, with -mno-sse,

copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> >

const&):    # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned

char> > const&)

        xor     eax, eax

        test    al, al

        jne     .LBB0_1

        push    ebx

        push    eax

        mov     ecx, dword ptr [esp + 12]

        mov     edx, 4

        mov     ecx, dword ptr [ecx]

.LBB0_3:                                # =>This Inner Loop Header: Depth=1

        movzx   ebx, byte ptr [ecx + eax]

        mov     byte ptr [esp + eax], bl

        inc     eax

        cmp     edx, eax

        jne     .LBB0_3

        mov     eax, dword ptr [esp]

        add     esp, 4

        pop     ebx

        ret

While the recommended way is the first one (even suggested on

cppreference.com), the second one is important because libstdc++ uses that for

their copy iterator on STL containers:

  template <>

  struct __copy_move<false, false, random_access_iterator_tag> {

    template <typename _II, typename _OI>

    static _OI __copy_m(_II __first, _II __last, _OI __result) {

      typedef typename iterator_traits<_II>::difference_type _Distance;

      for (_Distance __n = __last - __first; __n > 0; --__n) {

        *__result = *__first;

        ++__first;

        ++__result;

      }

      return __result;

    }

  };

As a result, std::copy on libstdc++ is inefficient here.

This affects all targets, although x86_64 seems to generate slightly better

code than others, and i386 generating slightly worse code than the others.

x86_64 output:

copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> >

const&):    # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned

char> > const&)

        mov     rax, qword ptr [rdi]

        mov     cl, byte ptr [rax]

        mov     byte ptr [rsp - 4], cl

        mov     cl, byte ptr [rax + 1]

        mov     byte ptr [rsp - 3], cl

        mov     cl, byte ptr [rax + 2]

        mov     byte ptr [rsp - 2], cl

        mov     al, byte ptr [rax + 3]

        mov     byte ptr [rsp - 1], al

        mov     eax, dword ptr [rsp - 4]

        ret

Comparison: <a href="https://godbolt.org/z/wwr8Sz">https://godbolt.org/z/wwr8Sz</a></pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>