[llvm-bugs] [Bug 32036] New: Vectorized horizontal reduction returning wrong result starting at r294934

Tue Feb 21 18:37:58 PST 2017

https://bugs.llvm.org/show_bug.cgi?id=32036

            Bug ID: 32036
           Summary: Vectorized horizontal reduction returning wrong result
                    starting at r294934
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: andrew.b.adams at gmail.com
                CC: llvm-bugs at lists.llvm.org

Created attachment 18015
  --> https://bugs.llvm.org/attachment.cgi?id=18015&action=edit
ll that reproduces

The attached .ll generates code that computes different values before and
after r294934.

The reduction it implements is roughly:

int val = 1;
for (int y = 0; y < 8; y++) {
    for (int x = 0; x < 8; x++) {
         val = val + input[y*8 + x] + 3;
     }
}

The output value is exactly 168 (= 7*8*3) less than it ought to be, so perhaps
the +3 is getting lost on all but one of the eight inner-loop iterations in
each pass of the outer loop?

The inner loop before that commit is:

        movl    (%r14,%rsi), %edx
        movl    4(%r14,%rsi), %ebp
        cmpl    %r8d, %edx
        cmovgl  %r8d, %edx
        addl    (%rbx,%rsi), %eax
        addl    4(%rbx,%rsi), %eax
        cmpl    %edx, %ebp
        cmovlel %ebp, %edx
        addl    8(%rbx,%rsi), %eax
        movl    8(%r14,%rsi), %ebp
        cmpl    %edx, %ebp
        cmovlel %ebp, %edx
        addl    12(%rbx,%rsi), %eax
        movl    12(%r14,%rsi), %ebp
        cmpl    %edx, %ebp
        cmovlel %ebp, %edx
        addl    16(%rbx,%rsi), %eax
        movl    16(%r14,%rsi), %ebp
        cmpl    %edx, %ebp
        cmovlel %ebp, %edx
        addl    20(%rbx,%rsi), %eax
        movl    20(%r14,%rsi), %ebp
        cmpl    %edx, %ebp
        cmovlel %ebp, %edx
        addl    24(%rbx,%rsi), %eax
        movl    24(%r14,%rsi), %ebp
        cmpl    %edx, %ebp
        cmovlel %ebp, %edx
        movl    28(%rbx,%rsi), %ebp
        movl    28(%r14,%rsi), %r9d
        cmpl    %edx, %r9d
        movl    %edx, %r8d
        cmovlel %r9d, %r8d
        leal    24(%rbp,%rax), %eax
        addq    $32, %rsi
        cmpq    $256, %rsi 

The loop over x has been fully unrolled. This inner loop is also computing a
min reduction, so ignore the cmpl/cmovgl/cmovlel instructions and the loads
from (%r14,%rsi) that feed them. The relevant instructions for the summation
are:

        addl    (%rbx,%rsi), %eax
        addl    4(%rbx,%rsi), %eax
        addl    8(%rbx,%rsi), %eax
        addl    12(%rbx,%rsi), %eax
        addl    16(%rbx,%rsi), %eax
        addl    20(%rbx,%rsi), %eax
        addl    24(%rbx,%rsi), %eax
        movl    28(%rbx,%rsi), %ebp
        leal    24(%rbp,%rax), %eax

The first 7 values are added, and then the last value is loaded into ebp, and
then added using an leal, with the constant term (8*3 = 24) accounted for in
the leal. This is correct.

The inner loop after that commit is:

        movl    (%rbx,%rsi), %edx
        movl    4(%rbx,%rsi), %r13d
        cmpl    %r8d, %edx
        cmovgl  %r8d, %edx
        vmovdqu (%r14,%rsi), %ymm0
        cmpl    %edx, %r13d
        cmovlel %r13d, %edx
        movl    8(%rbx,%rsi), %ebp
        cmpl    %edx, %ebp
        cmovlel %ebp, %edx
        movl    12(%rbx,%rsi), %ebp
        cmpl    %edx, %ebp
        cmovlel %ebp, %edx
        movl    16(%rbx,%rsi), %ebp
        cmpl    %edx, %ebp
        cmovlel %ebp, %edx
        movl    20(%rbx,%rsi), %ebp
        cmpl    %edx, %ebp
        cmovlel %ebp, %edx
        movl    24(%rbx,%rsi), %ebp
        cmpl    %edx, %ebp
        cmovlel %ebp, %edx
        movl    28(%rbx,%rsi), %ebp
        cmpl    %edx, %ebp
        movl    %edx, %r8d
        cmovlel %ebp, %r8d
        vextracti128    $1, %ymm0, %xmm1
        vpaddd  %ymm1, %ymm0, %ymm0
        vpshufd $78, %xmm0, %xmm1       # xmm1 = xmm0[2,3,0,1]
        vpaddd  %ymm1, %ymm0, %ymm0
        vphaddd %ymm0, %ymm0, %ymm0
        vmovd   %xmm0, %edx
        leal    3(%rdx,%rax), %eax
        addq    $32, %rsi
        cmpq    $256, %rsi              # imm = 0x100

The relevant instructions for the summation are:

        vmovdqu (%r14,%rsi), %ymm0
        vextracti128    $1, %ymm0, %xmm1
        vpaddd  %ymm1, %ymm0, %ymm0
        vpshufd $78, %xmm0, %xmm1       # xmm1 = xmm0[2,3,0,1]
        vpaddd  %ymm1, %ymm0, %ymm0
        vphaddd %ymm0, %ymm0, %ymm0
        vmovd   %xmm0, %edx
        leal    3(%rdx,%rax), %eax

All values are loaded at once into a vector, and then they are horizontally
reduced into a single value, to which 3 is added using leal. Ah-hah! That
should be 24 (= 8*3), not 3: the constant is being added once per
eight-element vector instead of once per element.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170222/ce817539/attachment.html>