<html>

    <head>

      <base href="http://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - Usage of a copy of a register just after a mov instruction"

   href="http://llvm.org/bugs/show_bug.cgi?id=18605">18605</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Usage of a copy of a register just after a mov instruction

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>jn@sirrida.de

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvmbugs@cs.uiuc.edu

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Using a copy of a register just after a mov instruction may cost an extra cycle

on all but the very newest x86 processors (4th generation Intel Core).

On superscalar processors, a mov instruction and a subsequent modification of

its *source* register can be executed in parallel.

Typically for every line of the C code below we can reduce the number of used

cycles from 3 to 2 (measured on e.g. Intel i7 920, Intel Atom N450).

Even if the "correct order" is given in the source, the "wrong order" is produced.

Interestingly both GCC and ICC also show this strange behavior;

is there any reason to do it this way?

int test(int x) {

  int y;

  x ^= (x >> 2);

  x = (x >> 3) ^ x;

  x = x ^ (x >> 4);

  y = x;  x >>= 5;  x ^= y;  // almost the same but explicit

  return x;

  }

=>

    movl    %edi, %eax

    sarl    $2, %eax  // => sarl    $2, %edi

    xorl    %edi, %eax

    movl    %eax, %ecx

    sarl    $3, %ecx  // => sarl    $3, %eax

    xorl    %eax, %ecx

    movl    %ecx, %edx

    sarl    $4, %edx  // => sarl    $4, %ecx

    xorl    %ecx, %edx

    movl    %edx, %eax

    sarl    $5, %eax  // => sarl    $5, %edx

    xorl    %edx, %eax

    retq</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>