[llvm-bugs] [Bug 31690] New: Simple addition reduction not vectorised when limit is 48 or less.

Thu Jan 19 02:18:01 PST 2017

https://llvm.org/bugs/show_bug.cgi?id=31690

            Bug ID: 31690
           Summary: Simple addition reduction not vectorised when limit is
                    48 or less.
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: drraph at gmail.com
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

Consider this simple loop:

float f(float x[]) {
  float p = 1.0;
  for (int i = 0; i < 48; i++)
    p += x[i];
  return p;
}

clang 3.9.1 gives:

.LCPI0_0:
        .long   1065353216              # float 1
f:                                      # @f
        vmovss  xmm0, dword ptr [rdi]   # xmm0 = mem[0],zero,zero,zero
        vaddss  xmm0, xmm0, dword ptr [rip + .LCPI0_0]
        vaddss  xmm0, xmm0, dword ptr [rdi + 4]
        vaddss  xmm0, xmm0, dword ptr [rdi + 8]
        vaddss  xmm0, xmm0, dword ptr [rdi + 12]
        vaddss  xmm0, xmm0, dword ptr [rdi + 16]
        vaddss  xmm0, xmm0, dword ptr [rdi + 20]
        vaddss  xmm0, xmm0, dword ptr [rdi + 24]
        vaddss  xmm0, xmm0, dword ptr [rdi + 28]
        vaddss  xmm0, xmm0, dword ptr [rdi + 32]
        vaddss  xmm0, xmm0, dword ptr [rdi + 36]
        vaddss  xmm0, xmm0, dword ptr [rdi + 40]
        vaddss  xmm0, xmm0, dword ptr [rdi + 44]
        vaddss  xmm0, xmm0, dword ptr [rdi + 48]
        vaddss  xmm0, xmm0, dword ptr [rdi + 52]
        vaddss  xmm0, xmm0, dword ptr [rdi + 56]
        vaddss  xmm0, xmm0, dword ptr [rdi + 60]
        vaddss  xmm0, xmm0, dword ptr [rdi + 64]
        vaddss  xmm0, xmm0, dword ptr [rdi + 68]
        vaddss  xmm0, xmm0, dword ptr [rdi + 72]
        vaddss  xmm0, xmm0, dword ptr [rdi + 76]
        vaddss  xmm0, xmm0, dword ptr [rdi + 80]
        vaddss  xmm0, xmm0, dword ptr [rdi + 84]
        vaddss  xmm0, xmm0, dword ptr [rdi + 88]
        vaddss  xmm0, xmm0, dword ptr [rdi + 92]
        vaddss  xmm0, xmm0, dword ptr [rdi + 96]
        vaddss  xmm0, xmm0, dword ptr [rdi + 100]
        vaddss  xmm0, xmm0, dword ptr [rdi + 104]
        vaddss  xmm0, xmm0, dword ptr [rdi + 108]
        vaddss  xmm0, xmm0, dword ptr [rdi + 112]
        vaddss  xmm0, xmm0, dword ptr [rdi + 116]
        vaddss  xmm0, xmm0, dword ptr [rdi + 120]
        vaddss  xmm0, xmm0, dword ptr [rdi + 124]
        vaddss  xmm0, xmm0, dword ptr [rdi + 128]
        vaddss  xmm0, xmm0, dword ptr [rdi + 132]
        vaddss  xmm0, xmm0, dword ptr [rdi + 136]
        vaddss  xmm0, xmm0, dword ptr [rdi + 140]
        vaddss  xmm0, xmm0, dword ptr [rdi + 144]
        vaddss  xmm0, xmm0, dword ptr [rdi + 148]
        vaddss  xmm0, xmm0, dword ptr [rdi + 152]
        vaddss  xmm0, xmm0, dword ptr [rdi + 156]
        vaddss  xmm0, xmm0, dword ptr [rdi + 160]
        vaddss  xmm0, xmm0, dword ptr [rdi + 164]
        vaddss  xmm0, xmm0, dword ptr [rdi + 168]
        vaddss  xmm0, xmm0, dword ptr [rdi + 172]
        vaddss  xmm0, xmm0, dword ptr [rdi + 176]
        vaddss  xmm0, xmm0, dword ptr [rdi + 180]
        vaddss  xmm0, xmm0, dword ptr [rdi + 184]
        vaddss  xmm0, xmm0, dword ptr [rdi + 188]
        ret

This is fully unrolled but not vectorised.

However, using icc we get:

f:
        movups    xmm13, XMMWORD PTR [rdi]                      #4.10
        movups    xmm0, XMMWORD PTR [16+rdi]                    #4.10
        movups    xmm6, XMMWORD PTR [32+rdi]                    #4.10
        movups    xmm1, XMMWORD PTR [48+rdi]                    #4.10
        movups    xmm9, XMMWORD PTR [64+rdi]                    #4.10
        movups    xmm2, XMMWORD PTR [80+rdi]                    #4.10
        movups    xmm7, XMMWORD PTR [96+rdi]                    #4.10
        movups    xmm3, XMMWORD PTR [112+rdi]                   #4.10
        movups    xmm10, XMMWORD PTR [128+rdi]                  #4.10
        movups    xmm4, XMMWORD PTR [144+rdi]                   #4.10
        movups    xmm8, XMMWORD PTR [160+rdi]                   #4.10
        movups    xmm5, XMMWORD PTR [176+rdi]                   #4.10
        addps     xmm13, xmm0                                   #2.11
        addps     xmm6, xmm1                                    #2.11
        addps     xmm9, xmm2                                    #2.11
        addps     xmm7, xmm3                                    #2.11
        addps     xmm10, xmm4                                   #2.11
        addps     xmm8, xmm5                                    #2.11
        addps     xmm13, xmm6                                   #2.11
        addps     xmm9, xmm7                                    #2.11
        addps     xmm10, xmm8                                   #2.11
        addps     xmm13, xmm9                                   #2.11
        addps     xmm13, xmm10                                  #2.11
        movaps    xmm11, xmm13                                  #2.11
        movhlps   xmm11, xmm13                                  #2.11
        addps     xmm13, xmm11                                  #2.11
        movaps    xmm12, xmm13                                  #2.11
        shufps    xmm12, xmm13, 245                             #2.11
        addss     xmm13, xmm12                                  #2.11
        addss     xmm13, DWORD PTR .L_2il0floatpacket.0[rip]    #2.11
        movaps    xmm0, xmm13                                   #5.10
        ret                                                     #5.10
.L_2il0floatpacket.0:
        .long   0x3f800000

This is both vectorised and unrolled.

If you increase the loop limit from 48 to, say, 64, then clang/LLVM stops
unrolling and does manage to vectorise the code.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170119/a4800197/attachment.html>