[llvm-bugs] [Bug 32036] New: Vectorized horizontal reduction returning wrong result starting at r294934
via llvm-bugs
llvm-bugs at lists.llvm.org
Tue Feb 21 18:37:58 PST 2017
https://bugs.llvm.org/show_bug.cgi?id=32036
Bug ID: 32036
Summary: Vectorized horizontal reduction returning wrong result
starting at r294934
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Loop Optimizer
Assignee: unassignedbugs at nondot.org
Reporter: andrew.b.adams at gmail.com
CC: llvm-bugs at lists.llvm.org
Created attachment 18015
--> https://bugs.llvm.org/attachment.cgi?id=18015&action=edit
ll that reproduces
The attached .ll generates code that computes different values before and after
r294934.
The reduction it implements is roughly:
int val = 1;
for (int y = 0; y < 8; y++) {
for (int x = 0; x < 8; x++) {
val = val + input[y*8 + x] + 3;
}
}
The output value is smaller than it ought to be by exactly 168 = 7*8*3 (seven
missing additions of 3 in each of the 8 rows), so perhaps the +3 is getting
lost on all but the first iteration of the inner loop?
The inner loop before that commit is:
movl (%r14,%rsi), %edx
movl 4(%r14,%rsi), %ebp
cmpl %r8d, %edx
cmovgl %r8d, %edx
addl (%rbx,%rsi), %eax
addl 4(%rbx,%rsi), %eax
cmpl %edx, %ebp
cmovlel %ebp, %edx
addl 8(%rbx,%rsi), %eax
movl 8(%r14,%rsi), %ebp
cmpl %edx, %ebp
cmovlel %ebp, %edx
addl 12(%rbx,%rsi), %eax
movl 12(%r14,%rsi), %ebp
cmpl %edx, %ebp
cmovlel %ebp, %edx
addl 16(%rbx,%rsi), %eax
movl 16(%r14,%rsi), %ebp
cmpl %edx, %ebp
cmovlel %ebp, %edx
addl 20(%rbx,%rsi), %eax
movl 20(%r14,%rsi), %ebp
cmpl %edx, %ebp
cmovlel %ebp, %edx
addl 24(%rbx,%rsi), %eax
movl 24(%r14,%rsi), %ebp
cmpl %edx, %ebp
cmovlel %ebp, %edx
movl 28(%rbx,%rsi), %ebp
movl 28(%r14,%rsi), %r9d
cmpl %edx, %r9d
movl %edx, %r8d
cmovlel %r9d, %r8d
leal 24(%rbp,%rax), %eax
addq $32, %rsi
cmpq $256, %rsi
The loop over x has been fully unrolled. This inner loop is also computing a
min reduction, so ignore the cmpl and cmovgl/cmovlel instructions and the loads
that feed them. The relevant instructions for the summation are:
addl (%rbx,%rsi), %eax
addl 4(%rbx,%rsi), %eax
addl 8(%rbx,%rsi), %eax
addl 12(%rbx,%rsi), %eax
addl 16(%rbx,%rsi), %eax
addl 20(%rbx,%rsi), %eax
addl 24(%rbx,%rsi), %eax
movl 28(%rbx,%rsi), %ebp
leal 24(%rbp,%rax), %eax
The first 7 values are added, and then the last value is loaded into ebp, and
then added using an leal, with the constant term (8*3 = 24) accounted for in
the leal. This is correct.
The inner loop after that commit is:
movl (%rbx,%rsi), %edx
movl 4(%rbx,%rsi), %r13d
cmpl %r8d, %edx
cmovgl %r8d, %edx
vmovdqu (%r14,%rsi), %ymm0
cmpl %edx, %r13d
cmovlel %r13d, %edx
movl 8(%rbx,%rsi), %ebp
cmpl %edx, %ebp
cmovlel %ebp, %edx
movl 12(%rbx,%rsi), %ebp
cmpl %edx, %ebp
cmovlel %ebp, %edx
movl 16(%rbx,%rsi), %ebp
cmpl %edx, %ebp
cmovlel %ebp, %edx
movl 20(%rbx,%rsi), %ebp
cmpl %edx, %ebp
cmovlel %ebp, %edx
movl 24(%rbx,%rsi), %ebp
cmpl %edx, %ebp
cmovlel %ebp, %edx
movl 28(%rbx,%rsi), %ebp
cmpl %edx, %ebp
movl %edx, %r8d
cmovlel %ebp, %r8d
vextracti128 $1, %ymm0, %xmm1
vpaddd %ymm1, %ymm0, %ymm0
vpshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
vpaddd %ymm1, %ymm0, %ymm0
vphaddd %ymm0, %ymm0, %ymm0
vmovd %xmm0, %edx
leal 3(%rdx,%rax), %eax
addq $32, %rsi
cmpq $256, %rsi # imm = 0x100
The relevant instructions for the summation are:
vmovdqu (%r14,%rsi), %ymm0
vextracti128 $1, %ymm0, %xmm1
vpaddd %ymm1, %ymm0, %ymm0
vpshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
vpaddd %ymm1, %ymm0, %ymm0
vphaddd %ymm0, %ymm0, %ymm0
vmovd %xmm0, %edx
leal 3(%rdx,%rax), %eax
All values are loaded at once into a vector, and then they are horizontally
reduced into a single value, to which 3 is added using leal. Ah-hah! That
should be 24, not 3.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170222/ce817539/attachment.html>
More information about the llvm-bugs
mailing list