[llvm-bugs] [Bug 31677] New: clang/llvm fails to vectorize the product of a complex array
via llvm-bugs
llvm-bugs at lists.llvm.org
Wed Jan 18 07:10:31 PST 2017
https://llvm.org/bugs/show_bug.cgi?id=31677
Bug ID: 31677
Summary: clang/llvm fails to vectorize the product of a complex array
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Loop Optimizer
Assignee: unassignedbugs at nondot.org
Reporter: drraph at gmail.com
CC: llvm-bugs at lists.llvm.org
Classification: Unclassified
Consider this simple piece of code which takes the product of an array
of complex numbers.
#include <complex.h>
complex float f(complex float x[]) {
complex float p = 1.0;
for (int i = 0; i < 32; i++)
p *= x[i];
return p;
}
If I compile it with -O3 -march=bdver2 -ffast-math using clang 3.9.1, I get
unvectorised assembly:
.LCPI0_0:
.long 1065353216 # float 1
f: # @f
vxorps xmm1, xmm1, xmm1
vmovss xmm0, dword ptr [rip + .LCPI0_0] # xmm0 = mem[0],zero,zero,zero
xor eax, eax
.LBB0_1: # =>This Inner Loop Header: Depth=1
vmovss xmm2, dword ptr [rdi + 8*rax] # xmm2 = mem[0],zero,zero,zero
vmovss xmm3, dword ptr [rdi + 8*rax + 4] # xmm3 = mem[0],zero,zero,zero
vmulss xmm4, xmm2, xmm1
vmulss xmm5, xmm3, xmm1
vfmaddss xmm1, xmm3, xmm0, xmm4
vfmsubss xmm0, xmm2, xmm0, xmm5
inc rax
cmp rax, 32
jne .LBB0_1
vinsertps xmm0, xmm0, xmm1, 16 # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
ret
As a test, I also tried icc (the Intel compiler), which does appear to
produce vectorised code:
f:
xor eax, eax #4.3
movups xmm1, XMMWORD PTR .L_2il0floatpacket.0[rip] #3.19
..B1.2: # Preds ..B1.2 ..B1.1
movups xmm0, XMMWORD PTR [rdi+rax*8] #5.10
movups xmm4, XMMWORD PTR [32+rdi+rax*8] #5.10
movups xmm6, XMMWORD PTR [48+rdi+rax*8] #5.10
movups xmm8, XMMWORD PTR [64+rdi+rax*8] #5.10
movups xmm10, XMMWORD PTR [80+rdi+rax*8] #5.10
movups xmm12, XMMWORD PTR [96+rdi+rax*8] #5.10
movups xmm14, XMMWORD PTR [112+rdi+rax*8] #5.10
movaps xmm2, xmm0 #5.5
shufps xmm2, xmm0, 160 #5.5
mulps xmm2, xmm1 #5.5
xorps xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.5
shufps xmm1, xmm1, 177 #5.5
shufps xmm0, xmm0, 245 #5.5
mulps xmm1, xmm0 #5.5
addps xmm2, xmm1 #5.5
movups xmm1, XMMWORD PTR [16+rdi+rax*8] #5.10
movaps xmm3, xmm2 #5.5
add rax, 16 #4.3
shufps xmm3, xmm2, 160 #5.5
mulps xmm3, xmm1 #5.5
xorps xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.5
shufps xmm1, xmm1, 177 #5.5
shufps xmm2, xmm2, 245 #5.5
mulps xmm1, xmm2 #5.5
addps xmm3, xmm1 #5.5
movaps xmm5, xmm3 #5.5
shufps xmm5, xmm3, 160 #5.5
mulps xmm5, xmm4 #5.5
xorps xmm4, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.5
shufps xmm4, xmm4, 177 #5.5
shufps xmm3, xmm3, 245 #5.5
mulps xmm4, xmm3 #5.5
addps xmm5, xmm4 #5.5
movaps xmm7, xmm5 #5.5
shufps xmm7, xmm5, 160 #5.5
mulps xmm7, xmm6 #5.5
xorps xmm6, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.5
shufps xmm6, xmm6, 177 #5.5
shufps xmm5, xmm5, 245 #5.5
mulps xmm6, xmm5 #5.5
addps xmm7, xmm6 #5.5
movaps xmm9, xmm7 #5.5
shufps xmm9, xmm7, 160 #5.5
mulps xmm9, xmm8 #5.5
xorps xmm8, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.5
shufps xmm8, xmm8, 177 #5.5
shufps xmm7, xmm7, 245 #5.5
mulps xmm8, xmm7 #5.5
addps xmm9, xmm8 #5.5
movaps xmm11, xmm9 #5.5
shufps xmm11, xmm9, 160 #5.5
mulps xmm11, xmm10 #5.5
xorps xmm10, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.5
shufps xmm10, xmm10, 177 #5.5
shufps xmm9, xmm9, 245 #5.5
mulps xmm10, xmm9 #5.5
addps xmm11, xmm10 #5.5
movaps xmm13, xmm11 #5.5
shufps xmm13, xmm11, 160 #5.5
mulps xmm13, xmm12 #5.5
xorps xmm12, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.5
shufps xmm12, xmm12, 177 #5.5
shufps xmm11, xmm11, 245 #5.5
mulps xmm12, xmm11 #5.5
addps xmm13, xmm12 #5.5
movaps xmm1, xmm13 #5.5
shufps xmm1, xmm13, 160 #5.5
mulps xmm1, xmm14 #5.5
xorps xmm14, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.5
shufps xmm14, xmm14, 177 #5.5
shufps xmm13, xmm13, 245 #5.5
mulps xmm14, xmm13 #5.5
addps xmm1, xmm14 #5.5
cmp rax, 32 #4.3
jb ..B1.2 # Prob 96% #4.3
movaps xmm2, xmm1 #3.19
movhlps xmm2, xmm1 #3.19
movaps xmm0, xmm2 #3.19
shufps xmm0, xmm2, 160 #3.19
mulps xmm0, xmm1 #3.19
xorps xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip] #3.19
shufps xmm1, xmm1, 177 #3.19
shufps xmm2, xmm2, 245 #3.19
mulps xmm1, xmm2 #3.19
addps xmm0, xmm1 #3.19
ret #6.10
.L_2il0floatpacket.0:
.long 0x3f800000,0x00000000,0x3f800000,0x00000000
.L_2il0floatpacket.1:
.long 0x00000000,0x80000000,0x00000000,0x80000000
Interestingly, clang *can* vectorise the same loop when the product is replaced by a sum:
#include <complex.h>
complex float f(complex float x[]) {
complex float p = 1.0;
for (int i = 0; i < 32; i++)
p += x[i]; /* <--- + instead of * */
return p;
}
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170118/812139d5/attachment-0001.html>
More information about the llvm-bugs
mailing list