[llvm-bugs] [Bug 52464] New: Avoid vectorization of very small loops

via llvm-bugs llvm-bugs at lists.llvm.org
Wed Nov 10 01:01:51 PST 2021


https://bugs.llvm.org/show_bug.cgi?id=52464

            Bug ID: 52464
           Summary: Avoid vectorization of very small loops
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: david.bolvansky at gmail.com
                CC: llvm-bugs at lists.llvm.org

/*
 * Reproducer: returns the sum of the first (n & 0x3) ints starting at addr.
 * The mask bounds the trip count to 0..3, so the body runs at most three
 * times and reads at most addr[0]..addr[2].
 */
int foo (int n, int *addr)
{
  int count, sum = 0;
  /* count starts at n & 0x3 and counts down; zero iterations when it is 0. */
  for ( count = n & 0x3; count > 0; count--, addr++ )
    sum += *addr;
  return sum;
}

/*
 * Reproducer variant of foo with a `>=` loop condition: the body runs
 * (n & 0x3) + 1 times, i.e. between one and four times, and therefore
 * always reads addr[0] and at most addr[0]..addr[3].
 */
int foo0 (int n, int *addr)
{
  int count, sum = 0;
  /* count >= 0 holds on entry for every n, so the first iteration is unconditional. */
  for ( count = n & 0x3; count >= 0; count--, addr++ )
    sum += *addr;
  return sum;
}


GCC -O3:
# GCC output for foo: the loop is fully unrolled into straight-line scalar
# code, with one compare-and-branch per possible trip count (0, 1, 2, 3).
# edi = n, rsi = addr, result in eax (presumably the System V AMD64
# convention, given "OS: Linux" in the report).
foo(int, int*):
  mov eax, edi
  and eax, 3                  # eax = n & 3 (count); ZF set when count == 0
  je .L1                      # count == 0: return, eax is already 0
  mov edx, DWORD PTR [rsi]    # edx = addr[0]
  cmp eax, 1
  je .L4                      # count == 1: return addr[0]
  add edx, DWORD PTR [rsi+4]  # edx += addr[1]
  cmp eax, 2
  mov eax, edx                # mov leaves the flags from the cmp above intact
  je .L1                      # count == 2: return addr[0] + addr[1]
  add eax, DWORD PTR [rsi+8]  # count == 3: add addr[2]
  ret
.L1:
  ret
.L4:
  mov eax, edx
  ret
# GCC output for foo0: fully unrolled scalar code. The first load is
# unconditional because the `count >= 0` loop always runs at least once.
# edi = n, rsi = addr, result in eax (presumably System V AMD64).
foo0(int, int*):
  mov eax, DWORD PTR [rsi]    # sum = addr[0]
  and edi, 3                  # edi = n & 3 (count); ZF set when count == 0
  je .L10                     # count == 0: one iteration only
  add eax, DWORD PTR [rsi+4]  # sum += addr[1]
  cmp edi, 1
  je .L10                     # count == 1: two iterations
  add eax, DWORD PTR [rsi+8]  # sum += addr[2]
  cmp edi, 2
  je .L10                     # count == 2: three iterations
  add eax, DWORD PTR [rsi+12] # count == 3: fourth iteration, sum += addr[3]
.L10:
  ret



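LLVM produces a huge amount of vectorized code for the same two functions, even though the masked trip count is at most 3 (foo) or 4 (foo0) iterations: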
# LLVM output for foo: a masked, 4-lane vectorized loop plus a 4x-unrolled
# version of it, although the source trip count is at most 3.
# edi = n, rsi = addr, result in eax (presumably System V AMD64).
#
# Constant pool: per-lane element offsets used to build the load mask.
.LCPI0_0:
        .quad   0                               # 0x0
        .quad   1                               # 0x1
        .quad   2                               # 0x2
        .quad   3                               # 0x3
# Sign-bit constant, XOR-ed/OR-ed into both compare operands below so that
# the signed vpcmpgtq behaves as an unsigned comparison.
.LCPI0_1:
        .quad   -9223372036854775808            # 0x8000000000000000
foo(int, int*):                              # @foo(int, int*)
        # edi = n & 3 (count); return 0 immediately when it is zero.
        and     edi, 3
        je      .LBB0_1
        # rdi = count - 1, the last valid element index.
        dec     edi
        # rcx = (count + 3) rounded down to a multiple of 4.
        lea     rcx, [rdi + 4]
        and     rcx, -4
        # ymm8 = count - 1 broadcast to all four 64-bit lanes.
        vmovq   xmm0, rdi
        vpbroadcastq    ymm8, xmm0
        # rdx = number of 4-lane vector iterations; eax = that number mod 4,
        # i.e. the iterations left over for the non-unrolled loop at .LBB0_8.
        add     rcx, -4
        mov     rdx, rcx
        shr     rdx, 2
        inc     rdx
        mov     eax, edx
        and     eax, 3
        # Take the 4x-unrolled loop only when at least 4 vector iterations exist.
        cmp     rcx, 12
        jae     .LBB0_4
        # Otherwise: zero the accumulator (xmm1) and the element index (rcx).
        vpxor   xmm1, xmm1, xmm1
        xor     ecx, ecx
        jmp     .LBB0_6
.LBB0_1:
        # count == 0: return 0.
        xor     eax, eax
        ret
.LBB0_4:
        # Setup for the unrolled loop. r8 is addr biased by 48 bytes so the
        # four loads below use displacements -48, -32, -16 and 0.
        lea     r8, [rsi + 48]
        # rdx = -(vector iterations rounded down to a multiple of 4); it is
        # stepped by +4 per pass and the loop ends when it reaches zero.
        and     rdx, -4
        neg     rdx
        vpxor   xmm1, xmm1, xmm1
        xor     ecx, ecx
        vmovdqa ymm9, ymmword ptr [rip + .LCPI0_0] # ymm9 = [0,1,2,3]
        # NOTE: the bare "[...]" line after the next instruction is the tail of
        # its trailing comment, wrapped by the mailing-list archive.
        vpbroadcastq    ymm3, qword ptr [rip + .LCPI0_1] # ymm3 =
[9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
        # ymm4 = (count - 1) with the sign bit set; ymm5 = all ones.
        vpor    ymm4, ymm8, ymm3
        vpcmpeqd        ymm5, ymm5, ymm5
.LBB0_5:                                # =>This Inner Loop Header: Depth=1
        # Group 0: build the lane mask "index <= count - 1" for elements
        # rcx..rcx+3 (compare greater-than, then invert with all ones), narrow
        # it from 4x64-bit to 4x32-bit, masked-load and add into xmm1.
        vmovq   xmm6, rcx
        vpbroadcastq    ymm6, xmm6
        vpor    ymm6, ymm9, ymm6
        vpxor   ymm6, ymm6, ymm3
        vpcmpgtq        ymm6, ymm6, ymm4
        vpxor   ymm6, ymm6, ymm5
        vextracti128    xmm7, ymm6, 1
        vpackssdw       xmm6, xmm6, xmm7
        vpmaskmovd      xmm6, xmm6, xmmword ptr [r8 + 4*rcx - 48]
        vpaddd  xmm1, xmm6, xmm1
        # Group 1: same mask computation for elements rcx+4..rcx+7; the loaded
        # values stay in xmm6.
        lea     rdi, [rcx + 4]
        vmovq   xmm6, rdi
        vpbroadcastq    ymm6, xmm6
        vpor    ymm6, ymm9, ymm6
        vpxor   ymm6, ymm6, ymm3
        vpcmpgtq        ymm6, ymm6, ymm4
        vpxor   ymm6, ymm6, ymm5
        vextracti128    xmm7, ymm6, 1
        vpackssdw       xmm6, xmm6, xmm7
        vpmaskmovd      xmm6, xmm6, xmmword ptr [r8 + 4*rcx - 32]
        # Group 2: elements rcx+8..rcx+11; xmm0 = group 1 + group 2.
        lea     rdi, [rcx + 8]
        vmovq   xmm7, rdi
        vpbroadcastq    ymm7, xmm7
        vpor    ymm7, ymm9, ymm7
        vpxor   ymm7, ymm7, ymm3
        vpcmpgtq        ymm7, ymm7, ymm4
        vpxor   ymm7, ymm7, ymm5
        vextracti128    xmm0, ymm7, 1
        vpackssdw       xmm0, xmm7, xmm0
        vpmaskmovd      xmm0, xmm0, xmmword ptr [r8 + 4*rcx - 16]
        vpaddd  xmm0, xmm6, xmm0
        # Group 3: elements rcx+12..rcx+15, loaded into xmm2.
        lea     rdi, [rcx + 12]
        vmovq   xmm6, rdi
        vpbroadcastq    ymm6, xmm6
        vpor    ymm6, ymm9, ymm6
        vpxor   ymm6, ymm6, ymm3
        vpcmpgtq        ymm6, ymm6, ymm4
        vpxor   ymm6, ymm6, ymm5
        vextracti128    xmm7, ymm6, 1
        vpackssdw       xmm6, xmm6, xmm7
        vpmaskmovd      xmm2, xmm6, xmmword ptr [r8 + 4*rcx]
        # Fold all four groups into the accumulator xmm1.
        vpaddd  xmm7, xmm1, xmm0
        vpaddd  xmm1, xmm2, xmm7
        # Advance by 16 elements / 4 vector iterations.
        add     rcx, 16
        add     rdx, 4
        jne     .LBB0_5
.LBB0_6:
        # Remainder: rax = leftover vector iterations (0..3); skip when zero.
        test    rax, rax
        je      .LBB0_9
        # Negate so the loop can count up to zero with inc/jne.
        neg     rax
        vmovdqa ymm2, ymmword ptr [rip + .LCPI0_0] # ymm2 = [0,1,2,3]
        # NOTE: the bare "[...]" line after the next instruction is the tail of
        # its trailing comment, wrapped by the mailing-list archive.
        vpbroadcastq    ymm3, qword ptr [rip + .LCPI0_1] # ymm3 =
[9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
        # ymm0 = (count - 1) with the sign bit set; ymm4 = all ones.
        vpor    ymm0, ymm8, ymm3
        vpcmpeqd        ymm4, ymm4, ymm4
.LBB0_8:                                # =>This Inner Loop Header: Depth=1
        # One masked 4-lane step: same mask computation as in .LBB0_5, with
        # the 32-bit mask left in xmm6 and the loaded values in xmm5.
        vmovq   xmm5, rcx
        vpbroadcastq    ymm5, xmm5
        vpor    ymm5, ymm5, ymm2
        vpxor   ymm5, ymm5, ymm3
        vpcmpgtq        ymm5, ymm5, ymm0
        vpxor   ymm5, ymm5, ymm4
        vextracti128    xmm6, ymm5, 1
        vpackssdw       xmm6, xmm5, xmm6
        vpmaskmovd      xmm5, xmm6, xmmword ptr [rsi + 4*rcx]
        # xmm7 keeps the accumulator from before this step; xmm1 is updated.
        vmovdqa xmm7, xmm1
        vpaddd  xmm1, xmm5, xmm1
        add     rcx, 4
        inc     rax
        jne     .LBB0_8
.LBB0_9:
        # Per lane, select the updated accumulator (xmm1) where the last mask
        # (xmm6, moved into the sign bit by the shift) is set, otherwise the
        # previous accumulator (xmm7).
        vpslld  xmm0, xmm6, 31
        vblendvps       xmm0, xmm7, xmm1, xmm0
        # Horizontal add of the four 32-bit lanes into lane 0, returned in eax.
        vpermilps       xmm1, xmm0, 238         # xmm1 = xmm0[2,3,2,3]
        vpaddd  xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 85                  # xmm1 = xmm0[1,1,1,1]
        vpaddd  xmm0, xmm0, xmm1
        vmovd   eax, xmm0
        vzeroupper
        ret
# LLVM output for foo0: the whole loop becomes a single masked 4-lane load
# followed by a horizontal add (no loop remains).
# edi = n, rsi = addr, result in eax (presumably System V AMD64).
#
# Constant pool: per-lane element indices used to build the load mask.
.LCPI1_0:
        .quad   0                               # 0x0
        .quad   1                               # 0x1
        .quad   2                               # 0x2
        .quad   3                               # 0x3
foo0(int, int*):                             # @foo0(int, int*)
        # edi = n & 3 (count), broadcast to all four 64-bit lanes of ymm0.
        and     edi, 3
        vmovq   xmm0, rdi
        vpbroadcastq    ymm0, xmm0
        vmovdqa ymm1, ymmword ptr [rip + .LCPI1_0] # ymm1 = [0,1,2,3]
        # Lane i becomes all ones when i > count, then is inverted with an
        # all-ones vector, giving the mask "i <= count".
        vpcmpgtq        ymm0, ymm1, ymm0
        vpcmpeqd        ymm1, ymm1, ymm1
        vpxor   ymm0, ymm0, ymm1
        # Narrow the 4x64-bit mask to 4x32-bit for the dword masked load.
        vextracti128    xmm1, ymm0, 1
        vpackssdw       xmm0, xmm0, xmm1
        # Masked load of addr[0..3]; lanes whose mask is clear are not loaded.
        vpmaskmovd      xmm0, xmm0, xmmword ptr [rsi]
        # Horizontal add of the four 32-bit lanes into lane 0, returned in eax.
        vpshufd xmm1, xmm0, 238                 # xmm1 = xmm0[2,3,2,3]
        vpaddd  xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 85                  # xmm1 = xmm0[1,1,1,1]
        vpaddd  xmm0, xmm0, xmm1
        vmovd   eax, xmm0
        vzeroupper
        ret


Harder case (the trip count is (n & 0x3) * 2, i.e. 0, 2, 4 or 6):
/*
 * Reproducer: returns the sum of the first (n & 0x3) * 2 ints starting at
 * addr. The trip count is therefore one of 0, 2, 4 or 6, and at most
 * addr[0]..addr[5] are read.
 */
int foo2x (int n, int *addr)
{
  int count, sum = 0;
  /* count starts at twice the masked value and counts down to zero. */
  for ( count = (n & 0x3) * 2; count > 0; count--, addr++ )
    sum += *addr;
  return sum;
}

GCC:
# GCC output for foo2x: fully unrolled scalar code that adds elements two at
# a time, branching on n & 3 (half the trip count).
# edi = n, rsi = addr, result in eax (presumably System V AMD64).
foo2x(int, int*):
  mov eax, edi
  and eax, 3                  # eax = n & 3; ZF set when it is 0
  je .L1                      # zero iterations: return, eax is already 0
  mov edx, DWORD PTR [rsi+4]  # edx = addr[1]
  add edx, DWORD PTR [rsi]    # edx += addr[0]
  cmp eax, 1
  je .L5                      # n & 3 == 1: two elements summed
  add edx, DWORD PTR [rsi+8]  # edx += addr[2]
  add edx, DWORD PTR [rsi+12] # edx += addr[3]
  cmp eax, 2
  je .L5                      # n & 3 == 2: four elements summed
  mov eax, DWORD PTR [rsi+20] # n & 3 == 3: eax = addr[5]
  add edx, DWORD PTR [rsi+16] # edx += addr[4]
  add eax, edx                # result = sum of addr[0]..addr[5]
  ret
.L5:
  mov eax, edx                # return the partial sum held in edx
.L1:
  ret


https://godbolt.org/z/8hnoKPvob

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20211110/e58b004b/attachment-0001.html>


More information about the llvm-bugs mailing list