[llvm-dev] Vector evolution?
Alexandre Bique via llvm-dev
llvm-dev at lists.llvm.org
Tue Sep 1 04:07:37 PDT 2020
Hi,
Please consider the following loop:
// 16-byte vector of float, declared with the GCC/Clang vector_size extension.
using v4f32 = float __attribute__((__vector_size__(16)));
// Multiplies each of the 256 v4f32 elements pointed to by x by 7, in place.
void fct6(v4f32 *x)
{
// Explicitly request loop vectorization from clang for the loop below.
#pragma clang loop vectorize(enable)
for (int i = 0; i < 256; ++i)
x[i] = 7 * x[i];
}
After compiling it with:
clang++ -O3 -march=native -mtune=native \
-Rpass=loop-vectorize,slp-vectorize \
-Rpass-missed=loop-vectorize,slp-vectorize \
-Rpass-analysis=loop-vectorize,slp-vectorize \
-ffast-math -ffp-model=fast -ffp-exception-behavior=ignore \
-ffp-contract=fast -mrecip=all:0 \
-c -o vec.o vec.cc
I get the following codegen:
0000000000000160 <_Z4fct6PDv4_f>:
160: 31 c0 xor %eax,%eax
162: c4 e2 79 18 05 00 00 vbroadcastss 0x0(%rip),%xmm0 # 16b <_Z4fct6PDv4_f+0xb>
169: 00 00
16b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
170: c5 f8 59 0c 07 vmulps (%rdi,%rax,1),%xmm0,%xmm1
175: c5 f8 29 0c 07 vmovaps %xmm1,(%rdi,%rax,1)
17a: c5 f8 59 4c 07 10 vmulps 0x10(%rdi,%rax,1),%xmm0,%xmm1
180: c5 f8 29 4c 07 10 vmovaps %xmm1,0x10(%rdi,%rax,1)
186: c5 f8 59 4c 07 20 vmulps 0x20(%rdi,%rax,1),%xmm0,%xmm1
18c: c5 f8 29 4c 07 20 vmovaps %xmm1,0x20(%rdi,%rax,1)
192: c5 f8 59 4c 07 30 vmulps 0x30(%rdi,%rax,1),%xmm0,%xmm1
198: c5 f8 29 4c 07 30 vmovaps %xmm1,0x30(%rdi,%rax,1)
19e: c5 f8 59 4c 07 40 vmulps 0x40(%rdi,%rax,1),%xmm0,%xmm1
1a4: c5 f8 29 4c 07 40 vmovaps %xmm1,0x40(%rdi,%rax,1)
1aa: c5 f8 59 4c 07 50 vmulps 0x50(%rdi,%rax,1),%xmm0,%xmm1
1b0: c5 f8 29 4c 07 50 vmovaps %xmm1,0x50(%rdi,%rax,1)
1b6: c5 f8 59 4c 07 60 vmulps 0x60(%rdi,%rax,1),%xmm0,%xmm1
1bc: c5 f8 29 4c 07 60 vmovaps %xmm1,0x60(%rdi,%rax,1)
1c2: c5 f8 59 4c 07 70 vmulps 0x70(%rdi,%rax,1),%xmm0,%xmm1
1c8: c5 f8 29 4c 07 70 vmovaps %xmm1,0x70(%rdi,%rax,1)
1ce: 48 83 e8 80 sub $0xffffffffffffff80,%rax
1d2: 48 3d 00 10 00 00 cmp $0x1000,%rax
1d8: 75 96 jne 170 <_Z4fct6PDv4_f+0x10>
1da: c3 retq
My CPU is an Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz, so AVX2 is available.
Shouldn't the compiler be able to understand the loop and upgrade the vector width?
On the other hand, if I write the following loop:
// Multiplies each of the 4 * 256 floats pointed to by x by 7, in place.
void fct7(float *x)
{
// Explicitly request loop vectorization from clang for the loop below.
#pragma clang loop vectorize(enable)
for (int i = 0; i < 4 * 256; ++i)
x[i] = 7 * x[i];
}
It compiles to:
00000000000001e0 <_Z4fct7Pf>:
1e0: 31 c0 xor %eax,%eax
1e2: c4 e2 7d 18 05 00 00 vbroadcastss 0x0(%rip),%ymm0 # 1eb <_Z4fct7Pf+0xb>
1e9: 00 00
1eb: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
1f0: c5 fc 59 0c 87 vmulps (%rdi,%rax,4),%ymm0,%ymm1
1f5: c5 fc 59 54 87 20 vmulps 0x20(%rdi,%rax,4),%ymm0,%ymm2
1fb: c5 fc 59 5c 87 40 vmulps 0x40(%rdi,%rax,4),%ymm0,%ymm3
201: c5 fc 59 64 87 60 vmulps 0x60(%rdi,%rax,4),%ymm0,%ymm4
207: c5 fc 11 0c 87 vmovups %ymm1,(%rdi,%rax,4)
20c: c5 fc 11 54 87 20 vmovups %ymm2,0x20(%rdi,%rax,4)
212: c5 fc 11 5c 87 40 vmovups %ymm3,0x40(%rdi,%rax,4)
218: c5 fc 11 64 87 60 vmovups %ymm4,0x60(%rdi,%rax,4)
21e: c5 fc 59 8c 87 80 00 vmulps 0x80(%rdi,%rax,4),%ymm0,%ymm1
225: 00 00
227: c5 fc 59 94 87 a0 00 vmulps 0xa0(%rdi,%rax,4),%ymm0,%ymm2
22e: 00 00
230: c5 fc 59 9c 87 c0 00 vmulps 0xc0(%rdi,%rax,4),%ymm0,%ymm3
237: 00 00
239: c5 fc 59 a4 87 e0 00 vmulps 0xe0(%rdi,%rax,4),%ymm0,%ymm4
240: 00 00
242: c5 fc 11 8c 87 80 00 vmovups %ymm1,0x80(%rdi,%rax,4)
249: 00 00
24b: c5 fc 11 94 87 a0 00 vmovups %ymm2,0xa0(%rdi,%rax,4)
252: 00 00
254: c5 fc 11 9c 87 c0 00 vmovups %ymm3,0xc0(%rdi,%rax,4)
25b: 00 00
25d: c5 fc 11 a4 87 e0 00 vmovups %ymm4,0xe0(%rdi,%rax,4)
264: 00 00
266: 48 83 c0 40 add $0x40,%rax
26a: 48 3d 00 04 00 00 cmp $0x400,%rax
270: 0f 85 7a ff ff ff jne 1f0 <_Z4fct7Pf+0x10>
276: c5 f8 77 vzeroupper
279: c3 retq
This version uses wider (256-bit ymm) vectors.
What do you think? Why not transform fct6's loop to use wider registers as well?
Regards,
--
Alexandre Bique
More information about the llvm-dev
mailing list