[llvm-bugs] [Bug 40095] New: STL iterator std::copy/for loop produces inefficient code

via llvm-bugs llvm-bugs at lists.llvm.org
Tue Dec 18 19:59:52 PST 2018


https://bugs.llvm.org/show_bug.cgi?id=40095

            Bug ID: 40095
           Summary: STL iterator std::copy/for loop produces inefficient
                    code
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: husseydevin at gmail.com
                CC: llvm-bugs at lists.llvm.org

This code was tested with libstdc++ on Godbolt.

From looking at compiler versions on Godbolt, I noticed that clang 6.0 will convert this
code:

unsigned copy_equal(const std::vector<unsigned char> &__restrict vec)
{
    unsigned ret;
    unsigned char *__restrict result = reinterpret_cast<unsigned char*>(&ret);

    auto last = vec.cbegin() + sizeof(unsigned);
    auto first = vec.cbegin();
    while (first != last) {
        *result = *first;
        ++first;
        ++result;
    }
    return ret;
}

as expected, to this (clang -O3 -m32):

copy_equal(std::vector<unsigned char, std::allocator<unsigned char> > const&): # @copy_equal(std::vector<unsigned char, std::allocator<unsigned char> > const&)
        mov     eax, dword ptr [esp + 4]
        mov     eax, dword ptr [eax]
        mov     eax, dword ptr [eax]
        ret

GCC has done this same optimization since 4.9, although it aligns the stack
first.
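For comparison (this helper is my addition, not taken from the report), the same load can
be written with std::memcpy, which both compilers typically fold into a single mov:

#include <cstring>
#include <vector>

unsigned copy_memcpy(const std::vector<unsigned char> &vec)
{
    unsigned ret;
    // A 4-byte memcpy is recognized as a plain unaligned load.
    std::memcpy(&ret, vec.data(), sizeof(ret));
    return ret;
}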

However, when the loop counts down instead:

unsigned copy_reverse(const std::vector<unsigned char> &__restrict vec)
{
    unsigned ret;
    unsigned char *__restrict result = reinterpret_cast<unsigned char*>(&ret);

    auto last = vec.cbegin() + sizeof(unsigned);
    auto first = vec.cbegin();

    for (auto n = last - first; n > 0; --n) {
        *result = *first;
        ++first;
        ++result;
    }
    return ret;
}

we get this:

copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&): # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&)
        xor     eax, eax
        test    al, al
        jne     .LBB0_1
        push    ebx
        push    esi
        push    eax
        mov     eax, dword ptr [esp + 16]
        mov     edx, 4
        mov     ecx, esp
        cmp     edx, 32
        mov     eax, dword ptr [eax]
        jb      .LBB0_6
        xor     esi, esi
.LBB0_4:                                # =>This Inner Loop Header: Depth=1
        movups  xmm0, xmmword ptr [eax + esi]
        movups  xmm1, xmmword ptr [eax + esi + 16]
        movups  xmmword ptr [esp + esi], xmm0
        movups  xmmword ptr [esp + esi + 16], xmm1
        add     esi, 32
        jne     .LBB0_4
        test    edx, edx
        je      .LBB0_8
.LBB0_6:
        lea     edx, [eax + 4]
        xor     esi, esi
        sub     edx, ecx
        sub     edx, eax
        lea     edx, [esp + edx]
.LBB0_7:                                # =>This Inner Loop Header: Depth=1
        movzx   ebx, byte ptr [eax + esi]
        mov     byte ptr [ecx + esi], bl
        inc     esi
        cmp     edx, esi
        jne     .LBB0_7
.LBB0_8:
        mov     eax, dword ptr [esp]
        add     esp, 4
        pop     esi
        pop     ebx
        ret
.LBB0_1:
        ret

or, with -mno-sse,

copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&): # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&)
        xor     eax, eax
        test    al, al
        jne     .LBB0_1
        push    ebx
        push    eax
        mov     ecx, dword ptr [esp + 12]
        mov     edx, 4
        mov     ecx, dword ptr [ecx]
.LBB0_3:                                # =>This Inner Loop Header: Depth=1
        movzx   ebx, byte ptr [ecx + eax]
        mov     byte ptr [esp + eax], bl
        inc     eax
        cmp     edx, eax
        jne     .LBB0_3
        mov     eax, dword ptr [esp]
        add     esp, 4
        pop     ebx
        ret

While the first form is the recommended way to write such a loop (it is even the one
suggested on cppreference.com), the second one matters because libstdc++ uses it in its
copy implementation for random-access iterators on STL containers:

  template <>
  struct __copy_move<false, false, random_access_iterator_tag> {
    template <typename _II, typename _OI>
    static _OI __copy_m(_II __first, _II __last, _OI __result) {
      typedef typename iterator_traits<_II>::difference_type _Distance;
      for (_Distance __n = __last - __first; __n > 0; --__n) {
        *__result = *__first;
        ++__first;
        ++__result;
      }
      return __result;
    }
  };

As a result, std::copy on libstdc++ is inefficient here.
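To make the connection concrete (the helper below is my addition, not from the original
report), calling std::copy directly hits the __copy_m specialization above and produces
the byte-by-byte code shown earlier rather than a single load:

#include <algorithm>
#include <vector>

unsigned copy_std(const std::vector<unsigned char> &vec)
{
    unsigned ret;
    // On libstdc++ this dispatches to the count-down __copy_m loop,
    // so it compiles like copy_reverse, not copy_equal.
    std::copy(vec.cbegin(), vec.cbegin() + sizeof(unsigned),
              reinterpret_cast<unsigned char *>(&ret));
    return ret;
}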

This affects all targets, although x86_64 seems to generate slightly better code than the
others, and i386 slightly worse.

x86_64 output:

copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&): # @copy_reverse(std::vector<unsigned char, std::allocator<unsigned char> > const&)
        mov     rax, qword ptr [rdi]
        mov     cl, byte ptr [rax]
        mov     byte ptr [rsp - 4], cl
        mov     cl, byte ptr [rax + 1]
        mov     byte ptr [rsp - 3], cl
        mov     cl, byte ptr [rax + 2]
        mov     byte ptr [rsp - 2], cl
        mov     al, byte ptr [rax + 3]
        mov     byte ptr [rsp - 1], al
        mov     eax, dword ptr [rsp - 4]
        ret

Comparison: https://godbolt.org/z/wwr8Sz
