[llvm-bugs] [Bug 28146] New: A zexted setcc generates a setcc + movzbl instead of xor + setcc
via llvm-bugs
llvm-bugs at lists.llvm.org
Wed Jun 15 14:05:24 PDT 2016
https://llvm.org/bugs/show_bug.cgi?id=28146
Bug ID: 28146
Summary: A zexted setcc generates a setcc + movzbl instead of xor + setcc
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: mkuper at google.com
CC: llvm-bugs at lists.llvm.org
Classification: Unclassified
Consider:
#include <stdio.h>
int main() {
unsigned x = 0;
unsigned y = 0;
#pragma nounroll
for (unsigned i = 0; i < 1000000000; ++i) {
y += x ^ 13;
x += ((i + 100) >= 1000) * 3;
}
return y;
}
We generate:
.text
.globl main
.p2align 4, 0x90
.type main,@function
main:
.cfi_startproc
xorl %eax, %eax
movl $100, %ecx
xorl %edi, %edi
.p2align 4, 0x90
.LBB0_1:
movl %edi, %esi
xorl $13, %esi
addl %esi, %eax
cmpl $999, %ecx
seta %dl # <===
movzbl %dl, %edx # <===
leal (%rdx,%rdx,2), %edx
addl %edx, %edi
incl %ecx
cmpl $1000000100, %ecx
jne .LBB0_1
retq
.Lfunc_end0:
.size main, .Lfunc_end0-main
.cfi_endproc
Instead of:
.text
.globl main
.p2align 4, 0x90
.type main,@function
main:
.cfi_startproc
xorl %eax, %eax
movl $100, %ecx
xorl %edi, %edi
.p2align 4, 0x90
.LBB0_1:
movl %edi, %esi
xorl $13, %esi
addl %esi, %eax
xorl %edx, %edx # <===
cmpl $999, %ecx
seta %dl # <===
leal (%rdx,%rdx,2), %edx
addl %edx, %edi
incl %ecx
cmpl $1000000100, %ecx
jne .LBB0_1
retq
.Lfunc_end0:
.size main, .Lfunc_end0-main
.cfi_endproc
The xor encodes smaller than the movzbl, which in itself is a good reason to
generate the former. However, there is a more surprising performance issue -
even though both versions ought to avoid partial register stalls, using the xor
idiom turns out to be much faster.
On a Haswell machine:
$ bin/clang -O2 ~/llvm/temp/setcc.s -o ~/llvm/temp/setcc.exe && time ~/llvm/temp/setcc.exe
real 0m1.045s
user 0m1.043s
sys 0m0.001s
$ bin/clang -O2 ~/llvm/temp/setcc-faster.s -o ~/llvm/temp/setcc.exe && time ~/llvm/temp/setcc.exe
real 0m0.876s
user 0m0.874s
sys 0m0.002s
Could someone at Intel confirm that this is expected? IACA doesn't show
significant stalling for the slower version, but the stalling does exist in
practice (for the slower version, about 15% stalls, and this can be
significantly increased by making the dependency chain longer).
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20160615/3c5e5141/attachment.html>
More information about the llvm-bugs
mailing list