[llvm-bugs] [Bug 52464] New: Avoid vectorization of very small loops
via llvm-bugs
llvm-bugs at lists.llvm.org
Wed Nov 10 01:01:51 PST 2021
https://bugs.llvm.org/show_bug.cgi?id=52464
Bug ID: 52464
Summary: Avoid vectorization of very small loops
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Loop Optimizer
Assignee: unassignedbugs at nondot.org
Reporter: david.bolvansky at gmail.com
CC: llvm-bugs at lists.llvm.org
/*
 * Reproducer 1: sum the first (n & 0x3) ints starting at addr.
 * The mask keeps only the low two bits of n, so the loop body executes
 * between 0 and 3 times and the result is 0 when the masked count is 0.
 */
int foo (int n, int *addr)
{
int count, sum = 0;
/* count runs from (n & 0x3) down to 1; addr advances one int per iteration. */
for ( count = n & 0x3; count > 0; count--, addr++ )
sum += *addr;
return sum;
}
/*
 * Reproducer 2: same as foo but with an inclusive bound (count >= 0),
 * so the loop body executes (n & 0x3) + 1 times: always at least once
 * and at most four times. *addr is therefore read unconditionally.
 */
int foo0 (int n, int *addr)
{
int count, sum = 0;
/* count runs from (n & 0x3) down to 0 inclusive. */
for ( count = n & 0x3; count >= 0; count--, addr++ )
sum += *addr;
return sum;
}
GCC -O3
# Scalar, fully unrolled code for foo: at most three loads/adds with an
# early exit after each element. Under the System V AMD64 convention
# edi = n, rsi = addr and the result is returned in eax.
foo(int, int*):
mov eax, edi
and eax, 3
# eax = n & 3 (trip count). If it is zero, eax already holds the result 0.
je .L1
mov edx, DWORD PTR [rsi]
cmp eax, 1
# trip count 1: the sum is just addr[0], still in edx
je .L4
add edx, DWORD PTR [rsi+4]
cmp eax, 2
# mov leaves the flags untouched, so the je below still tests "eax == 2"
mov eax, edx
je .L1
# trip count 3: add the third element
add eax, DWORD PTR [rsi+8]
ret
.L1:
ret
.L4:
mov eax, edx
ret
# Scalar, fully unrolled code for foo0: addr[0] is loaded unconditionally,
# then up to three more elements are added, leaving as soon as the masked
# count (edi = n & 3) is exhausted. Result is accumulated directly in eax.
foo0(int, int*):
mov eax, DWORD PTR [rsi]
and edi, 3
# masked count 0: only addr[0] contributes
je .L10
add eax, DWORD PTR [rsi+4]
cmp edi, 1
je .L10
add eax, DWORD PTR [rsi+8]
cmp edi, 2
je .L10
# masked count 3: fourth and last element
add eax, DWORD PTR [rsi+12]
.L10:
ret
LLVM produces a huge amount of vectorized code:
# Constant pool for the vectorized foo.
# .LCPI0_0 holds the per-lane element offsets [0,1,2,3] as 64-bit values.
.LCPI0_0:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
# .LCPI0_1 is the 64-bit sign bit; it is XORed/ORed into both compare
# operands below so that the signed vpcmpgtq behaves as an unsigned compare.
.LCPI0_1:
.quad -9223372036854775808 # 0x8000000000000000
# Vectorized, tail-folded code for foo. Each 4-lane step builds a mask
# "lane index <= trip count - 1", performs a masked load of four ints and
# accumulates into xmm1. System V AMD64: edi = n, rsi = addr, result in eax.
foo(int, int*): # @foo(int, int*)
and edi, 3
# masked trip count 0: return 0
je .LBB0_1
# edi is now 1..3; after dec it is (trip count - 1) in 0..2, and the 32-bit
# write has zeroed the upper half of rdi.
dec edi
# rcx = ((rdi + 4) & -4) - 4. With rdi in 0..2 this is always 0.
lea rcx, [rdi + 4]
and rcx, -4
# ymm8 = broadcast of (trip count - 1), the upper bound for the lane mask
vmovq xmm0, rdi
vpbroadcastq ymm8, xmm0
add rcx, -4
# rdx = number of 4-lane vector steps; eax = that count modulo 4 (steps left
# for the non-unrolled loop at .LBB0_8)
mov rdx, rcx
shr rdx, 2
inc rdx
mov eax, edx
and eax, 3
# NOTE(review): since rcx is always 0 here (see above), this jae is never
# taken for any input, so the unrolled loop at .LBB0_5 is unreachable and
# .LBB0_8 runs exactly once (rax == 1).
cmp rcx, 12
jae .LBB0_4
# xmm1 = vector accumulator, rcx = element index, both start at zero
vpxor xmm1, xmm1, xmm1
xor ecx, ecx
jmp .LBB0_6
.LBB0_1:
xor eax, eax
ret
# Setup for the 4x-unrolled vector loop (16 elements per iteration).
.LBB0_4:
lea r8, [rsi + 48]
and rdx, -4
neg rdx
vpxor xmm1, xmm1, xmm1
xor ecx, ecx
vmovdqa ymm9, ymmword ptr [rip + .LCPI0_0] # ymm9 = [0,1,2,3]
vpbroadcastq ymm3, qword ptr [rip + .LCPI0_1] # ymm3 =
# (the next line is the tail of the comment above, wrapped by the mail archive)
[9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
# ymm4 = sign-biased upper bound; ymm5 = all ones (used to invert masks)
vpor ymm4, ymm8, ymm3
vpcmpeqd ymm5, ymm5, ymm5
.LBB0_5: # =>This Inner Loop Header: Depth=1
# Step 1 of 4: lane indices rcx+[0..3], sign-biased compare against the
# bound, inverted to get "index <= bound", narrowed to 32-bit lanes, then
# used as the mask for a 4 x int load that is added into xmm1.
vmovq xmm6, rcx
vpbroadcastq ymm6, xmm6
vpor ymm6, ymm9, ymm6
vpxor ymm6, ymm6, ymm3
vpcmpgtq ymm6, ymm6, ymm4
vpxor ymm6, ymm6, ymm5
vextracti128 xmm7, ymm6, 1
vpackssdw xmm6, xmm6, xmm7
vpmaskmovd xmm6, xmm6, xmmword ptr [r8 + 4*rcx - 48]
vpaddd xmm1, xmm6, xmm1
# Step 2 of 4: same mask computation for indices rcx+4+[0..3]
lea rdi, [rcx + 4]
vmovq xmm6, rdi
vpbroadcastq ymm6, xmm6
vpor ymm6, ymm9, ymm6
vpxor ymm6, ymm6, ymm3
vpcmpgtq ymm6, ymm6, ymm4
vpxor ymm6, ymm6, ymm5
vextracti128 xmm7, ymm6, 1
vpackssdw xmm6, xmm6, xmm7
vpmaskmovd xmm6, xmm6, xmmword ptr [r8 + 4*rcx - 32]
# Step 3 of 4: indices rcx+8+[0..3]
lea rdi, [rcx + 8]
vmovq xmm7, rdi
vpbroadcastq ymm7, xmm7
vpor ymm7, ymm9, ymm7
vpxor ymm7, ymm7, ymm3
vpcmpgtq ymm7, ymm7, ymm4
vpxor ymm7, ymm7, ymm5
vextracti128 xmm0, ymm7, 1
vpackssdw xmm0, xmm7, xmm0
vpmaskmovd xmm0, xmm0, xmmword ptr [r8 + 4*rcx - 16]
# xmm0 = loads of step 2 + step 3
vpaddd xmm0, xmm6, xmm0
# Step 4 of 4: indices rcx+12+[0..3]
lea rdi, [rcx + 12]
vmovq xmm6, rdi
vpbroadcastq ymm6, xmm6
vpor ymm6, ymm9, ymm6
vpxor ymm6, ymm6, ymm3
vpcmpgtq ymm6, ymm6, ymm4
vpxor ymm6, ymm6, ymm5
vextracti128 xmm7, ymm6, 1
vpackssdw xmm6, xmm6, xmm7
vpmaskmovd xmm2, xmm6, xmmword ptr [r8 + 4*rcx]
# xmm7 = accumulator before the last load is added; xmm1 = after
vpaddd xmm7, xmm1, xmm0
vpaddd xmm1, xmm2, xmm7
add rcx, 16
# rdx was negated above, so it counts up towards zero
add rdx, 4
jne .LBB0_5
.LBB0_6:
# rax = number of remaining single vector steps; none left -> reduce
test rax, rax
je .LBB0_9
neg rax
vmovdqa ymm2, ymmword ptr [rip + .LCPI0_0] # ymm2 = [0,1,2,3]
vpbroadcastq ymm3, qword ptr [rip + .LCPI0_1] # ymm3 =
# (the next line is the tail of the comment above, wrapped by the mail archive)
[9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
vpor ymm0, ymm8, ymm3
vpcmpeqd ymm4, ymm4, ymm4
# Non-unrolled vector loop: one masked 4 x int load per iteration.
.LBB0_8: # =>This Inner Loop Header: Depth=1
vmovq xmm5, rcx
vpbroadcastq ymm5, xmm5
vpor ymm5, ymm5, ymm2
vpxor ymm5, ymm5, ymm3
vpcmpgtq ymm5, ymm5, ymm0
vpxor ymm5, ymm5, ymm4
vextracti128 xmm6, ymm5, 1
vpackssdw xmm6, xmm5, xmm6
vpmaskmovd xmm5, xmm6, xmmword ptr [rsi + 4*rcx]
# keep the accumulator from before this step in xmm7 for the final select
vmovdqa xmm7, xmm1
vpaddd xmm1, xmm5, xmm1
add rcx, 4
inc rax
jne .LBB0_8
.LBB0_9:
# Move each lane's mask bit into the sign position, then pick per lane
# between the accumulator before (xmm7) and after (xmm1) the last step.
vpslld xmm0, xmm6, 31
vblendvps xmm0, xmm7, xmm1, xmm0
# Horizontal add of the four 32-bit lanes into lane 0, returned in eax.
vpermilps xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
vpaddd xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 85 # xmm1 = xmm0[1,1,1,1]
vpaddd xmm0, xmm0, xmm1
vmovd eax, xmm0
vzeroupper
ret
# Constant pool for the vectorized foo0: per-lane element indices [0,1,2,3]
# as 64-bit values.
.LCPI1_0:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
# Loop-free vector code for foo0: a single masked 4 x int load from addr
# followed by a horizontal add. System V AMD64: edi = n, rsi = addr,
# result in eax.
foo0(int, int*): # @foo0(int, int*)
and edi, 3
# ymm0 = broadcast of the masked count (n & 3)
vmovq xmm0, rdi
vpbroadcastq ymm0, xmm0
vmovdqa ymm1, ymmword ptr [rip + .LCPI1_0] # ymm1 = [0,1,2,3]
# ymm0 = (index > count); inverting with all-ones gives (index <= count),
# i.e. the lanes the source loop would actually read
vpcmpgtq ymm0, ymm1, ymm0
vpcmpeqd ymm1, ymm1, ymm1
vpxor ymm0, ymm0, ymm1
# narrow the four 64-bit mask lanes to four 32-bit lanes
vextracti128 xmm1, ymm0, 1
vpackssdw xmm0, xmm0, xmm1
# masked load: lanes outside the mask read as zero
vpmaskmovd xmm0, xmm0, xmmword ptr [rsi]
# horizontal add of the four 32-bit lanes into lane 0
vpshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
vpaddd xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 85 # xmm1 = xmm0[1,1,1,1]
vpaddd xmm0, xmm0, xmm1
vmovd eax, xmm0
vzeroupper
ret
Harder case:
/*
 * Reproducer 3 ("harder case"): the trip count is (n & 0x3) * 2, so the
 * loop body executes 0, 2, 4 or 6 times -- still a small, bounded count,
 * but no longer a plain bit-mask of n.
 */
int foo2x (int n, int *addr)
{
int count, sum = 0;
/* count runs from (n & 0x3) * 2 down to 1; addr advances one int per iteration. */
for ( count = (n & 0x3) * 2; count > 0; count--, addr++ )
sum += *addr;
return sum;
}
GCC:
# Scalar, fully unrolled code for foo2x: elements are consumed two at a
# time, with an early exit after each pair. eax = n & 3 is the number of
# pairs; the running sum is kept in edx. System V AMD64: edi = n,
# rsi = addr, result in eax.
foo2x(int, int*):
mov eax, edi
and eax, 3
# zero pairs: eax is already the result 0
je .L1
# first pair: addr[1] + addr[0]
mov edx, DWORD PTR [rsi+4]
add edx, DWORD PTR [rsi]
cmp eax, 1
je .L5
# second pair: addr[2], addr[3]
add edx, DWORD PTR [rsi+8]
add edx, DWORD PTR [rsi+12]
cmp eax, 2
je .L5
# third pair: addr[5] + addr[4], combined with the running sum into eax
mov eax, DWORD PTR [rsi+20]
add edx, DWORD PTR [rsi+16]
add eax, edx
ret
.L5:
# early exits: move the running sum into the return register
mov eax, edx
.L1:
ret
https://godbolt.org/z/8hnoKPvob
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20211110/e58b004b/attachment-0001.html>
More information about the llvm-bugs mailing list