[llvm-bugs] [Bug 32523] New: Missed optimization: no optimization over loop boundaries

Tue Apr 4 12:51:54 PDT 2017

https://bugs.llvm.org/show_bug.cgi?id=32523

            Bug ID: 32523
           Summary: Missed optimization: no optimization over loop
                    boundaries
           Product: Polly
           Version: unspecified
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Optimizer
          Assignee: polly-dev at googlegroups.com
          Reporter: antoshkka at gmail.com
                CC: llvm-bugs at lists.llvm.org

Consider the following example:

///////////////
using size_t = unsigned long;

template <class T>
struct shared_ptr_like {
    T* ptr;
    void foo() noexcept { ptr = 0; }
    void bar() noexcept { if (ptr) { delete ptr; } }
};

typedef shared_ptr_like<int> test_type;
void relocate(test_type* new_buffer, size_t size) {
    for (size_t i = 0; i != size; ++i) {
        new_buffer[i].foo();
    }
    for (size_t i = 0; i != size; ++i) {
        new_buffer[i].bar();
    }
}
///////////////

It produces the following assembly:

relocate(shared_ptr_like<int>*, unsigned long):      #
@relocate(shared_ptr_like<int>*, unsigned long)
        push    r14
        push    rbx
        push    rax
        mov     r14, rsi
        mov     rbx, rdi
        test    r14, r14
        je      .LBB0_5
        lea     rdx, [8*r14]
        xor     esi, esi
        mov     rdi, rbx
        call    memset
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        mov     rdi, qword ptr [rbx]
        test    rdi, rdi
        je      .LBB0_4
        call    operator delete(void*)
.LBB0_4:                                #   in Loop: Header=BB0_2 Depth=1
        add     rbx, 8
        dec     r14
        jne     .LBB0_2
.LBB0_5:
        add     rsp, 8
        pop     rbx
        pop     r14
        ret

However the optimal assembly would be:

relocate(shared_ptr_like<int>*, unsigned long):      #
@relocate(shared_ptr_like<int>*, unsigned long)
        mov     rax, rsi
        test    rax, rax
        je      .LBB0_2
        push    rax
        shl     rax, 3
        xor     esi, esi
        mov     rdx, rax
        call    memset
        add     rsp, 8
.LBB0_2:
        ret

Assembly from above could be produced by following code that just hase a single
loop instead of two:

typedef shared_ptr_like<int> test_type;
void relocate(test_type* new_buffer, size_t size) {
    for (size_t i = 0; i != size; ++i) { // Single loop instead of two
        new_buffer[i].foo();
        new_buffer[i].bar();
    }
}

Optimizing over loop boundries is essential because multiple parts of C++
standard library traverse the same data twice. For example GCC's
std::vector::reserve has the following code:

  pointer __tmp = _M_allocate_and_copy(__n,
    _GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_start),
    _GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_finish));
  std::_Destroy(this->_M_impl._M_start, this->_M_impl._M_finish,
                _M_get_Tp_allocator()); 

Checked on CLANG 4.0 (tags/RELEASE_400/final 297782) with flags -std=c++1z -O2

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170404/dfa9d9ec/attachment.html>