[llvm-dev] Vector evolution?

Please consider the following loop:

using v4f32 = float __attribute__((__vector_size__(16)));

void fct6(v4f32 *x)
{
#pragma clang loop vectorize(enable)
  for (int i = 0; i < 256; ++i)
    x[i] = 7 * x[i];
}

After compiling it with:

clang++ -O3 -march=native -mtune=native \
-Rpass-analysis=loop-vectorize,slp-vectorize \
-ffast-math -ffp-model=fast -ffp-exception-behavior=ignore \
-ffp-contract=fast -mrecip=all:0 \
-c -o vec.o vec.cc

I get the following codegen:

0000000000000160 <_Z4fct6PDv4_f>:
 160: 31 c0                xor    %eax,%eax
 162: c4 e2 79 18 05 00 00 vbroadcastss 0x0(%rip),%xmm0        # 16b
 169: 00 00
 16b: 0f 1f 44 00 00        nopl   0x0(%rax,%rax,1)
 170: c5 f8 59 0c 07        vmulps (%rdi,%rax,1),%xmm0,%xmm1
 175: c5 f8 29 0c 07        vmovaps %xmm1,(%rdi,%rax,1)
 17a: c5 f8 59 4c 07 10    vmulps 0x10(%rdi,%rax,1),%xmm0,%xmm1
 180: c5 f8 29 4c 07 10    vmovaps %xmm1,0x10(%rdi,%rax,1)
 186: c5 f8 59 4c 07 20    vmulps 0x20(%rdi,%rax,1),%xmm0,%xmm1
 18c: c5 f8 29 4c 07 20    vmovaps %xmm1,0x20(%rdi,%rax,1)
 192: c5 f8 59 4c 07 30    vmulps 0x30(%rdi,%rax,1),%xmm0,%xmm1
 198: c5 f8 29 4c 07 30    vmovaps %xmm1,0x30(%rdi,%rax,1)
 19e: c5 f8 59 4c 07 40    vmulps 0x40(%rdi,%rax,1),%xmm0,%xmm1
 1a4: c5 f8 29 4c 07 40    vmovaps %xmm1,0x40(%rdi,%rax,1)
 1aa: c5 f8 59 4c 07 50    vmulps 0x50(%rdi,%rax,1),%xmm0,%xmm1
 1b0: c5 f8 29 4c 07 50    vmovaps %xmm1,0x50(%rdi,%rax,1)
 1b6: c5 f8 59 4c 07 60    vmulps 0x60(%rdi,%rax,1),%xmm0,%xmm1
 1bc: c5 f8 29 4c 07 60    vmovaps %xmm1,0x60(%rdi,%rax,1)
 1c2: c5 f8 59 4c 07 70    vmulps 0x70(%rdi,%rax,1),%xmm0,%xmm1
 1c8: c5 f8 29 4c 07 70    vmovaps %xmm1,0x70(%rdi,%rax,1)
 1ce: 48 83 e8 80          sub    $0xffffffffffffff80,%rax
 1d2: 48 3d 00 10 00 00    cmp    $0x1000,%rax
 1d8: 75 96                jne    170 <_Z4fct6PDv4_f+0x10>
 1da: c3                    retq

My CPU is an Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz, so I have AVX2.
Shouldn't the compiler understand the loop and widen the vectors?

On the other hand, if I write the following loop:

void fct7(float *x)
{
#pragma clang loop vectorize(enable)
  for (int i = 0; i < 4 * 256; ++i)
    x[i] = 7 * x[i];
}

It compiles it to:

00000000000001e0 <_Z4fct7Pf>:
 1e0: 31 c0                xor    %eax,%eax
 1e2: c4 e2 7d 18 05 00 00 vbroadcastss 0x0(%rip),%ymm0        # 1eb
 1e9: 00 00
 1eb: 0f 1f 44 00 00        nopl   0x0(%rax,%rax,1)
 1f0: c5 fc 59 0c 87        vmulps (%rdi,%rax,4),%ymm0,%ymm1
 1f5: c5 fc 59 54 87 20    vmulps 0x20(%rdi,%rax,4),%ymm0,%ymm2
 1fb: c5 fc 59 5c 87 40    vmulps 0x40(%rdi,%rax,4),%ymm0,%ymm3
 201: c5 fc 59 64 87 60    vmulps 0x60(%rdi,%rax,4),%ymm0,%ymm4
 207: c5 fc 11 0c 87        vmovups %ymm1,(%rdi,%rax,4)
 20c: c5 fc 11 54 87 20    vmovups %ymm2,0x20(%rdi,%rax,4)
 212: c5 fc 11 5c 87 40    vmovups %ymm3,0x40(%rdi,%rax,4)
 218: c5 fc 11 64 87 60    vmovups %ymm4,0x60(%rdi,%rax,4)
 21e: c5 fc 59 8c 87 80 00 vmulps 0x80(%rdi,%rax,4),%ymm0,%ymm1
 225: 00 00
 227: c5 fc 59 94 87 a0 00 vmulps 0xa0(%rdi,%rax,4),%ymm0,%ymm2
 22e: 00 00
 230: c5 fc 59 9c 87 c0 00 vmulps 0xc0(%rdi,%rax,4),%ymm0,%ymm3
 237: 00 00
 239: c5 fc 59 a4 87 e0 00 vmulps 0xe0(%rdi,%rax,4),%ymm0,%ymm4
 240: 00 00
 242: c5 fc 11 8c 87 80 00 vmovups %ymm1,0x80(%rdi,%rax,4)
 249: 00 00
 24b: c5 fc 11 94 87 a0 00 vmovups %ymm2,0xa0(%rdi,%rax,4)
 252: 00 00
 254: c5 fc 11 9c 87 c0 00 vmovups %ymm3,0xc0(%rdi,%rax,4)
 25b: 00 00
 25d: c5 fc 11 a4 87 e0 00 vmovups %ymm4,0xe0(%rdi,%rax,4)
 264: 00 00
 266: 48 83 c0 40          add    $0x40,%rax
 26a: 48 3d 00 04 00 00    cmp    $0x400,%rax
 270: 0f 85 7a ff ff ff    jne    1f0 <_Z4fct7Pf+0x10>
 276: c5 f8 77              vzeroupper
 279: c3                    retq

This version uses the wider ymm registers.

What do you think? Why not transform fct6's loop to use wider registers?

Alexandre Bique

