<html>
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - Optimization for removing same variable comparisons in loop: while(it != end1 && it != end2)"
   href="https://bugs.llvm.org/show_bug.cgi?id=33813">33813</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Optimization for removing same variable comparisons in loop: while(it != end1 && it != end2)
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Loop Optimizer
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>antoshkka@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr></table>
      <p>
        <div>
        <pre>Simple iteration over std::deque elements produces suboptimal code. For example:

#include &lt;deque&gt;

unsigned sum(std::deque&lt;unsigned&gt; cont) {
    unsigned sum = 0;
    for (unsigned v : cont)
        sum += v;

    return sum;
}


produces the following loop:

.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        cmp     rsi, rcx
        je      .LBB0_4
        add     eax, dword ptr [rcx]
        add     rcx, 4
        cmp     rcx, rdx
        jne     .LBB0_1
        jmp     .LBB0_3

The loop has two comparisons in it and behaves close to the following C code:

unsigned sum_like_deque_does(unsigned** chunks, unsigned* end) {
    unsigned sum = 0;

    for (unsigned* it = *chunks; it != end; it = *(++chunks)) {
        for (;it != end && it != *chunks + 128; ++it) {
            sum += *it;
        }
    }

    return sum;
}


Note the `it != end && it != *chunks + 128` condition. It could be simplified:
if `end` belongs to `[it, *chunks + 128]`, change the condition to `it != end`;
otherwise use the condition `it != *chunks + 128`. Such an optimization
removes one cmp from the loop and produces a much faster loop:

.LBB2_3:                                #   Parent Loop BB2_2 Depth=1
        add     eax, dword ptr [rcx]
        add     rcx, 4
        cmp     rdx, rcx
        jne     .LBB2_3

Synthetic tests show up to 2 times better performance. Assembly outputs:
<a href="https://godbolt.org/g/vGs2qs">https://godbolt.org/g/vGs2qs</a></pre>
        </div>
      </p>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>