<!doctype html>
<html lang="en">
    <head>
      <!-- Declare the encoding first so it is seen before any text is parsed. -->
      <meta charset="utf-8">
      <title>Bug 42900 - loop unrolling fails to properly optimize the remainder</title>
      <!-- Relative URLs in this message resolve against the bug tracker. -->
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - loop unrolling fails to properly optimize the remainder"
   href="https://bugs.llvm.org/show_bug.cgi?id=42900">42900</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>loop unrolling fails to properly optimize the remainder
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>clang
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>All
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>LLVM Codegen
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedclangbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>mschwar42@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org, neeilans@live.com, richard-llvm@metafoo.co.uk
          </td>
        </tr></table>
      <p>
        <div>
        <pre>when telling clang to unroll a loop using the pragma unroll, one would expect
the remainder (i.e. N % unroll_factor) to be taken care of outside of the
unrolled loop. Instead clang decides to include this check in each chunk of the
unrolled loop.

int cmp(const char *s1, const char *s2, size_t n){
  unsigned char c1 = '\0';
  unsigned char c2 = '\0';
  #pragma unroll 4
  for(size_t i = 0; i < n; i++){
    c1 = (unsigned char) *s1++;
    c2 = (unsigned char) *s2++;
    if (c1 == '\0' || c1 != c2) return c1 - c2;
  }
  return c1 - c2;
}

produces chunks like this:
        movzx   r11d, byte ptr [rdi + rax]
        movzx   ecx, byte ptr [rsi + rax]
        test    r11b, r11b
        je      .LBB0_10
        cmp     r11b, cl
        jne     .LBB0_10
        cmp     r10, rax
        je      .LBB0_6

instead of this:

        movzx   eax, BYTE PTR [rdi+rcx]
        movzx   r8d, BYTE PTR [rsi+rcx]
        test    al, al
        je      .L32
        cmp     al, r8b
        jne     .L32

comparison between gcc and clang:
<a href="https://godbolt.org/z/ZTADbh">https://godbolt.org/z/ZTADbh</a></pre>
        </div>
      </p>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>