[cfe-dev] LLVM/Clang optimization issue -- optimizer fails to discover common loop variable

mats petersson via cfe-dev cfe-dev at lists.llvm.org
Thu Jun 9 03:00:25 PDT 2016

To be precise about the "lack of optimisation" here: it is not that LLVM
fails to identify the common loop variable, but that it (probably
misguidedly) removes the loop variable and uses multiple pointers instead.

You WOULD want:

    for(i = 0; i < size; i++)
       s[i] = t[i];

to be converted to:

    stmp = s;
    ttemp = t;
    send = s+size;
    while(stmp < send)
       *stmp++ = *ttemp++;

right? So the compiler should recognise these cases, and do the conversion,
but only when it's actually "better".


On 9 June 2016 at 10:02, Wenzel Jakob via cfe-dev <cfe-dev at lists.llvm.org> wrote:

> Apologies, there was a typo in my last message.
> "The following output was generated by LLVM" in the third paragraph should
> have read "The following output was generated by GCC 5.3.0".
> -Wenzel
> On 09 Jun 2016, at 09:15, Wenzel Jakob <wenzel.jakob at epfl.ch> wrote:
> Hi,
> I’m having trouble getting LLVM/Clang to generate high quality code for a
> tight loop involving AVX intrinsics.
> Consider the following simple function, which computes a sum of a bunch of
> buffers.
> #include <immintrin.h>
> void sum(size_t n,
>         __m256* __restrict__ a, __m256* __restrict__ b,
>         __m256* __restrict__ c, __m256* __restrict__ d,
>         __m256* __restrict__ e, __m256* __restrict__ f,
>         __m256* __restrict__ g) {
>     for (size_t i = 0; i < n; ++i)
>         a[i] = _mm256_add_ps(
>             _mm256_add_ps(_mm256_add_ps(b[i], c[i]),
>                 _mm256_add_ps(d[i], e[i])),
>                 _mm256_add_ps(f[i], g[i]));
> }
> This is the main loop body resulting from the above expression (compiled
> with -mavx2 -O3 -fomit-frame-pointer -fno-unroll-loops). Note the large
> number of “addq” instructions!
> The compiler also seems to assume that the __m256 entries are unaligned,
> but that is another issue.
> LBB0_2:
> vmovups (%rdx), %ymm0
> vaddps (%rcx), %ymm0, %ymm0
> vmovups (%r8), %ymm1
> vaddps (%r9), %ymm1, %ymm1
> vaddps %ymm1, %ymm0, %ymm0
> vmovups (%rax), %ymm1
> vaddps (%r10), %ymm1, %ymm1
> vaddps %ymm1, %ymm0, %ymm0
> vmovups %ymm0, (%rsi)
> addq $32, %rdx
> addq $32, %rcx
> addq $32, %r8
> addq $32, %r9
> addq $32, %rax
> addq $32, %r10
> addq $32, %rsi
> decq %rdi
> jne LBB0_2
> The following output was generated by LLVM. It identifies the common
> counter variable and just stores the buffer offsets in registers.
> L5:
> vmovaps (%r9,%rax), %ymm1
> vmovaps (%rcx,%rax), %ymm0
> vaddps (%r8,%rax), %ymm1, %ymm2
> vaddps (%rdx,%rax), %ymm0, %ymm0
> vaddps %ymm0, %ymm2, %ymm1
> vmovaps (%r11,%rax), %ymm0
> vaddps (%rbx,%rax), %ymm0, %ymm0
> vaddps %ymm0, %ymm1, %ymm0
> vmovaps %ymm0, (%rsi,%rax)
> addq $32, %rax
> cmpq %rax, %r10
> jne L5
> Is there something that could be done to LLVM to generate better code in
> such cases?
> For reference, this is the associated LLVM IR:
> ; Function Attrs: nounwind ssp uwtable
> define void @_Z3summPDv8_fS0_S0_S0_S0_S0_S0_(i64 %n, <8 x float>* noalias
> nocapture %a, <8 x float>* noalias nocapture readonly %b, <8 x float>*
> noalias nocapture readonly %c, <8 x float>* noalias nocapture readonly %d,
> <8 x float>* noalias nocapture readonly %e, <8 x float>* noalias nocapture
> readonly %f, <8 x float>* noalias nocapture readonly %g) #0 {
>   %1 = icmp eq i64 %n, 0
>   br i1 %1, label %._crit_edge, label %.lr.ph
> ._crit_edge:                                      ; preds = %.lr.ph, %0
>   ret void
> .lr.ph:                                           ; preds = %0, %.lr.ph
>   %i.01 = phi i64 [ %20, %.lr.ph ], [ 0, %0 ]
>   %2 = getelementptr inbounds <8 x float>, <8 x float>* %b, i64 %i.01
>   %3 = load <8 x float>, <8 x float>* %2, align 16, !tbaa !2
>   %4 = getelementptr inbounds <8 x float>, <8 x float>* %c, i64 %i.01
>   %5 = load <8 x float>, <8 x float>* %4, align 16, !tbaa !2
>   %6 = fadd <8 x float> %3, %5
>   %7 = getelementptr inbounds <8 x float>, <8 x float>* %d, i64 %i.01
>   %8 = load <8 x float>, <8 x float>* %7, align 16, !tbaa !2
>   %9 = getelementptr inbounds <8 x float>, <8 x float>* %e, i64 %i.01
>   %10 = load <8 x float>, <8 x float>* %9, align 16, !tbaa !2
>   %11 = fadd <8 x float> %8, %10
>   %12 = fadd <8 x float> %6, %11
>   %13 = getelementptr inbounds <8 x float>, <8 x float>* %f, i64 %i.01
>   %14 = load <8 x float>, <8 x float>* %13, align 16, !tbaa !2
>   %15 = getelementptr inbounds <8 x float>, <8 x float>* %g, i64 %i.01
>   %16 = load <8 x float>, <8 x float>* %15, align 16, !tbaa !2
>   %17 = fadd <8 x float> %14, %16
>   %18 = fadd <8 x float> %12, %17
>   %19 = getelementptr inbounds <8 x float>, <8 x float>* %a, i64 %i.01
>   store <8 x float> %18, <8 x float>* %19, align 16, !tbaa !2
>   %20 = add nuw i64 %i.01, 1
>   %exitcond = icmp eq i64 %20, %n
>   br i1 %exitcond, label %._crit_edge, label %.lr.ph
> }
> Thank you and best regards,
> Wenzel
> _______________________________________________
> cfe-dev mailing list
> cfe-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/cfe-dev/attachments/20160609/2d99f1cd/attachment.html>

More information about the cfe-dev mailing list