[llvm-bugs] [Bug 28146] New: A zexted setcc generates a setcc + movzbl instead of xor + setcc

via llvm-bugs llvm-bugs at lists.llvm.org
Wed Jun 15 14:05:24 PDT 2016


https://llvm.org/bugs/show_bug.cgi?id=28146

            Bug ID: 28146
           Summary: A zexted setcc generates a setcc + movzbl instead of
                    xor + setcc
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: mkuper at google.com
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

Consider:

#include <stdio.h>
int main() {
  unsigned x = 0;
  unsigned y = 0;
#pragma nounroll
  for (unsigned i = 0; i < 1000000000; ++i) {
    y += x ^ 13;
    x += ((i + 100) >= 1000) * 3;
  }
  return y;
}

We generate:

    .text
    .globl    main
    .p2align    4, 0x90
    .type    main,@function
main:
    .cfi_startproc
    xorl    %eax, %eax
    movl    $100, %ecx
    xorl    %edi, %edi
    .p2align    4, 0x90
.LBB0_1:
    movl    %edi, %esi
    xorl    $13, %esi
    addl    %esi, %eax
    cmpl    $999, %ecx
    seta    %dl                     # <===
    movzbl    %dl, %edx               # <===
    leal    (%rdx,%rdx,2), %edx
    addl    %edx, %edi
    incl    %ecx
    cmpl    $1000000100, %ecx
    jne    .LBB0_1
    retq
.Lfunc_end0:
    .size    main, .Lfunc_end0-main
    .cfi_endproc

Instead of:

  .text
  .globl  main
  .p2align  4, 0x90
  .type main,@function
main:
  .cfi_startproc
  xorl  %eax, %eax
  movl  $100, %ecx
  xorl  %edi, %edi
  .p2align  4, 0x90
.LBB0_1:
  movl  %edi, %esi
  xorl  $13, %esi
  addl  %esi, %eax
  xorl  %edx, %edx              # <===
  cmpl  $999, %ecx
  seta  %dl                     # <===
  leal  (%rdx,%rdx,2), %edx
  addl  %edx, %edi
  incl  %ecx
  cmpl  $1000000100, %ecx
  jne .LBB0_1
  retq
.Lfunc_end0:
  .size main, .Lfunc_end0-main
  .cfi_endproc

The xor encodes smaller than the movzbl, which in itself is a good reason to
generate the former. However, there is a more surprising performance issue -
even though both versions ought to avoid partial register stalls, using the xor
idiom turns out to be much faster.
On a Haswell machine:

$ bin/clang -O2 ~/llvm/temp/setcc.s -o ~/llvm/temp/setcc.exe && time
~/llvm/temp/setcc.exe

real    0m1.045s
user    0m1.043s
sys    0m0.001s

$ bin/clang -O2 ~/llvm/temp/setcc-faster.s -o ~/llvm/temp/setcc.exe && time
~/llvm/temp/setcc.exe

real    0m0.876s
user    0m0.874s
sys    0m0.002s

Could someone at Intel confirm that this is expected? IACA doesn't show
significant stalling for the slower version, but it exists in practice (for the
slower version, about ~15% stalls, and this can be significantly increased by
making the dependency chain longer.)

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20160615/3c5e5141/attachment.html>


More information about the llvm-bugs mailing list