<html>
    <head>
      <base href="http://llvm.org/bugs/" />
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW --- - LLVM generating inefficient code for simple loops"
   href="http://llvm.org/bugs/show_bug.cgi?id=22369">22369</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>LLVM generating inefficient code for simple loops
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>All
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Loop Optimizer
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>djasper@google.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvmbugs@cs.uiuc.edu
          </td>
        </tr>

        <tr>
          <th>Classification</th>
          <td>Unclassified
          </td>
        </tr></table>
      <p>
        <div>
        <pre>Consider the function:

uint8 *f(uint32 value, uint8 *target) {
  while (value >= 0x80) {
    *target = static_cast<uint8>(value | 0x80);
    value >>= 7;
    ++target;
  }
  *target = static_cast<uint8>(value);
  return target + 1;
}

For, this function, "clang -O2 -S" generates:

        cmpl    $128, %edi
        jb      .LBB0_1
        .align  16, 0x90
.LBB0_2:                                # %while.body
                                        # =>This Inner Loop Header: Depth=1
        movl    %edi, %eax
        orl     $128, %eax
        movb    %al, (%rsi)
        movl    %edi, %eax
        shrl    $7, %eax
        incq    %rsi
        cmpl    $16383, %edi            # imm = 0x3FFF
        movl    %eax, %edi
        ja      .LBB0_2
        jmp     .LBB0_3
.LBB0_1:
        movl    %edi, %eax
.LBB0_3:                                # %while.end
        movb    %al, (%rsi)
        incq    %rsi
        movq    %rsi, %rax
        retq

Which seems quite inefficient. There are several unnecessary moves. GCC instead
translates this to:

        jmp     .L4
        .p2align 4,,10
        .p2align 3
.L3:
        movl    %edi, %eax
        addq    $1, %rsi
        shrl    $7, %edi
        orl     $-128, %eax
        movb    %al, -1(%rsi)
.L4:
        cmpl    $127, %edi
        ja      .L3
        movb    %dil, (%rsi)
        leaq    1(%rsi), %rax
        ret

The LLVM IR generated by clang here already generates code that is very similar
to the final output (with instcombine folding the shift into the compare). If I
prevent this folding by adding "if (!Shr->hasOneUse()) return nullptr;" in
InstCombiner::FoldICmpShrCst()
(lib/Transforms/InstCombine/InstCombineCompares.cpp), then LLVM's code gets
somewhat better, but I suspect that LLVM should really be able to lower the IR
better.</pre>
        </div>
      </p>
      <hr>
      <span>You are receiving this mail because:</span>
      
      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>