<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - Missed optimization: no optimization over loop boundaries"
href="https://bugs.llvm.org/show_bug.cgi?id=32523">32523</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Missed optimization: no optimization over loop boundaries
</td>
</tr>
<tr>
<th>Product</th>
<td>Polly
</td>
</tr>
<tr>
<th>Version</th>
<td>unspecified
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Optimizer
</td>
</tr>
<tr>
<th>Assignee</th>
<td>polly-dev@googlegroups.com
</td>
</tr>
<tr>
<th>Reporter</th>
<td>antoshkka@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>Consider the following example:
///////////////
using size_t = unsigned long;
template <class T>
struct shared_ptr_like {
T* ptr;
void foo() noexcept { ptr = 0; }
void bar() noexcept { if (ptr) { delete ptr; } }
};
typedef shared_ptr_like<int> test_type;
void relocate(test_type* new_buffer, size_t size) {
for (size_t i = 0; i != size; ++i) {
new_buffer[i].foo();
}
for (size_t i = 0; i != size; ++i) {
new_buffer[i].bar();
}
}
///////////////
It produces the following assembly:
relocate(shared_ptr_like<int>*, unsigned long): #
@relocate(shared_ptr_like<int>*, unsigned long)
push r14
push rbx
push rax
mov r14, rsi
mov rbx, rdi
test r14, r14
je .LBB0_5
lea rdx, [8*r14]
xor esi, esi
mov rdi, rbx
call memset
.LBB0_2: # =>This Inner Loop Header: Depth=1
mov rdi, qword ptr [rbx]
test rdi, rdi
je .LBB0_4
call operator delete(void*)
.LBB0_4: # in Loop: Header=BB0_2 Depth=1
add rbx, 8
dec r14
jne .LBB0_2
.LBB0_5:
add rsp, 8
pop rbx
pop r14
ret
However the optimal assembly would be:
relocate(shared_ptr_like<int>*, unsigned long): #
@relocate(shared_ptr_like<int>*, unsigned long)
mov rax, rsi
test rax, rax
je .LBB0_2
push rax
shl rax, 3
xor esi, esi
mov rdx, rax
call memset
add rsp, 8
.LBB0_2:
ret
Assembly from above could be produced by following code that just hase a single
loop instead of two:
typedef shared_ptr_like<int> test_type;
void relocate(test_type* new_buffer, size_t size) {
for (size_t i = 0; i != size; ++i) { // Single loop instead of two
new_buffer[i].foo();
new_buffer[i].bar();
}
}
Optimizing over loop boundries is essential because multiple parts of C++
standard library traverse the same data twice. For example GCC's
std::vector::reserve has the following code:
pointer __tmp = _M_allocate_and_copy(__n,
_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_start),
_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_finish));
std::_Destroy(this->_M_impl._M_start, this->_M_impl._M_finish,
_M_get_Tp_allocator());
Checked on CLANG 4.0 (tags/RELEASE_400/final 297782) with flags -std=c++1z -O2</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>