[cfe-dev] LLVM/Clang optimization issue -- optimizer fails to discover common loop variable
Wenzel Jakob via cfe-dev
cfe-dev at lists.llvm.org
Thu Jun 9 02:02:04 PDT 2016
Apologies, there was a typo in my last message.
"The following output was generated by LLVM” in the third paragraph should have read "The following output was generated by GCC 5.3.0”.
-Wenzel
> On 09 Jun 2016, at 09:15, Wenzel Jakob <wenzel.jakob at epfl.ch> wrote:
>
> Hi,
>
> I’m having trouble getting LLVM/Clang to generate high-quality code for a tight loop involving AVX intrinsics.
> Consider the following simple function, which computes the element-wise sum of six input buffers (b through g) into an output buffer (a).
>
> #include <immintrin.h>
> #include <stddef.h>  /* for size_t */
>
> void sum(size_t n,
>          __m256* __restrict__ a, __m256* __restrict__ b,
>          __m256* __restrict__ c, __m256* __restrict__ d,
>          __m256* __restrict__ e, __m256* __restrict__ f,
>          __m256* __restrict__ g) {
>
>     for (size_t i = 0; i < n; ++i)
>         a[i] = _mm256_add_ps(
>             _mm256_add_ps(_mm256_add_ps(b[i], c[i]),
>                           _mm256_add_ps(d[i], e[i])),
>             _mm256_add_ps(f[i], g[i]));
> }
>
> This is the main loop body Clang generates for the above code (compiled with -mavx2 -O3 -fomit-frame-pointer -fno-unroll-loops). Note the large number of “addq” instructions: the loop maintains a separate pointer induction variable for each of the seven buffers and increments all of them on every iteration.
> The compiler also seems to assume that the __m256 entries are unaligned (vmovups rather than vmovaps), but that is a separate issue.
>
> LBB0_2:
> vmovups (%rdx), %ymm0
> vaddps (%rcx), %ymm0, %ymm0
> vmovups (%r8), %ymm1
> vaddps (%r9), %ymm1, %ymm1
> vaddps %ymm1, %ymm0, %ymm0
> vmovups (%rax), %ymm1
> vaddps (%r10), %ymm1, %ymm1
> vaddps %ymm1, %ymm0, %ymm0
> vmovups %ymm0, (%rsi)
> addq $32, %rdx
> addq $32, %rcx
> addq $32, %r8
> addq $32, %r9
> addq $32, %rax
> addq $32, %r10
> addq $32, %rsi
> decq %rdi
> jne LBB0_2
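>
> As an aside on the alignment issue: the sketch below is one way I could imagine spelling out the 32-byte alignment at the source level, by taking float pointers and using the explicitly aligned load/store intrinsics. The function name is only illustrative, and I have not verified that this changes the generated code; it is just meant to show the idea (the IR further down shows the plain __m256 loads coming through with “align 16”).
>
> #include <immintrin.h>
> #include <stddef.h>
>
> /* Sketch only: same computation, but over float pointers with the aligned
>    load/store intrinsics, which require (and communicate) 32-byte alignment.
>    n is still the number of __m256 elements per buffer. */
> void sum_aligned(size_t n,
>                  float* __restrict__ a, const float* __restrict__ b,
>                  const float* __restrict__ c, const float* __restrict__ d,
>                  const float* __restrict__ e, const float* __restrict__ f,
>                  const float* __restrict__ g) {
>
>     for (size_t i = 0; i < 8 * n; i += 8)
>         _mm256_store_ps(a + i, _mm256_add_ps(
>             _mm256_add_ps(_mm256_add_ps(_mm256_load_ps(b + i),
>                                         _mm256_load_ps(c + i)),
>                           _mm256_add_ps(_mm256_load_ps(d + i),
>                                         _mm256_load_ps(e + i))),
>             _mm256_add_ps(_mm256_load_ps(f + i),
>                           _mm256_load_ps(g + i))));
> }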
>
>
> The following output was generated by GCC 5.3.0 (see the correction above; this sentence originally said “LLVM”). It identifies the common counter variable, keeps the buffer base addresses in registers, and advances only a single offset (%rax) per iteration.
>
> L5:
> vmovaps (%r9,%rax), %ymm1
> vmovaps (%rcx,%rax), %ymm0
> vaddps (%r8,%rax), %ymm1, %ymm2
> vaddps (%rdx,%rax), %ymm0, %ymm0
> vaddps %ymm0, %ymm2, %ymm1
> vmovaps (%r11,%rax), %ymm0
> vaddps (%rbx,%rax), %ymm0, %ymm0
> vaddps %ymm0, %ymm1, %ymm0
> vmovaps %ymm0, (%rsi,%rax)
> addq $32, %rax
> cmpq %rax, %r10
> jne L5
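>
> For comparison, this is roughly what GCC’s addressing scheme looks like when written out in source form: one shared running byte offset, with the seven base pointers left untouched. The function name is illustrative and I have not checked whether writing the loop this way actually changes what Clang emits; it is only meant to make the single-induction-variable structure explicit.
>
> #include <immintrin.h>
> #include <stddef.h>
>
> /* Sketch only: the shared byte offset 'off' plays the role of %rax in the
>    GCC listing above; the seven base pointers stay fixed for the whole loop. */
> void sum_single_offset(size_t n,
>                        __m256* __restrict__ a, __m256* __restrict__ b,
>                        __m256* __restrict__ c, __m256* __restrict__ d,
>                        __m256* __restrict__ e, __m256* __restrict__ f,
>                        __m256* __restrict__ g) {
>
>     for (size_t off = 0; off < n * sizeof(__m256); off += sizeof(__m256)) {
>         __m256 vb = _mm256_loadu_ps((const float *) ((const char *) b + off));
>         __m256 vc = _mm256_loadu_ps((const float *) ((const char *) c + off));
>         __m256 vd = _mm256_loadu_ps((const float *) ((const char *) d + off));
>         __m256 ve = _mm256_loadu_ps((const float *) ((const char *) e + off));
>         __m256 vf = _mm256_loadu_ps((const float *) ((const char *) f + off));
>         __m256 vg = _mm256_loadu_ps((const float *) ((const char *) g + off));
>         __m256 s  = _mm256_add_ps(
>             _mm256_add_ps(_mm256_add_ps(vb, vc), _mm256_add_ps(vd, ve)),
>             _mm256_add_ps(vf, vg));
>         _mm256_storeu_ps((float *) ((char *) a + off), s);
>     }
> }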
>
> Is there something that could be done in LLVM to generate better code in such cases?
>
> For reference, this is the associated LLVM IR:
>
> ; Function Attrs: nounwind ssp uwtable
> define void @_Z3summPDv8_fS0_S0_S0_S0_S0_S0_(i64 %n, <8 x float>* noalias nocapture %a, <8 x float>* noalias nocapture readonly %b, <8 x float>* noalias nocapture readonly %c, <8 x float>* noalias nocapture readonly %d, <8 x float>* noalias nocapture readonly %e, <8 x float>* noalias nocapture readonly %f, <8 x float>* noalias nocapture readonly %g) #0 {
> %1 = icmp eq i64 %n, 0
> br i1 %1, label %._crit_edge, label %.lr.ph
>
> ._crit_edge: ; preds = %.lr.ph, %0
> ret void
>
> .lr.ph: ; preds = %0, %.lr.ph
> %i.01 = phi i64 [ %20, %.lr.ph ], [ 0, %0 ]
> %2 = getelementptr inbounds <8 x float>, <8 x float>* %b, i64 %i.01
> %3 = load <8 x float>, <8 x float>* %2, align 16, !tbaa !2
> %4 = getelementptr inbounds <8 x float>, <8 x float>* %c, i64 %i.01
> %5 = load <8 x float>, <8 x float>* %4, align 16, !tbaa !2
> %6 = fadd <8 x float> %3, %5
> %7 = getelementptr inbounds <8 x float>, <8 x float>* %d, i64 %i.01
> %8 = load <8 x float>, <8 x float>* %7, align 16, !tbaa !2
> %9 = getelementptr inbounds <8 x float>, <8 x float>* %e, i64 %i.01
> %10 = load <8 x float>, <8 x float>* %9, align 16, !tbaa !2
> %11 = fadd <8 x float> %8, %10
> %12 = fadd <8 x float> %6, %11
> %13 = getelementptr inbounds <8 x float>, <8 x float>* %f, i64 %i.01
> %14 = load <8 x float>, <8 x float>* %13, align 16, !tbaa !2
> %15 = getelementptr inbounds <8 x float>, <8 x float>* %g, i64 %i.01
> %16 = load <8 x float>, <8 x float>* %15, align 16, !tbaa !2
> %17 = fadd <8 x float> %14, %16
> %18 = fadd <8 x float> %12, %17
> %19 = getelementptr inbounds <8 x float>, <8 x float>* %a, i64 %i.01
> store <8 x float> %18, <8 x float>* %19, align 16, !tbaa !2
> %20 = add nuw i64 %i.01, 1
> %exitcond = icmp eq i64 %20, %n
> br i1 %exitcond, label %._crit_edge, label %.lr.ph
> }
>
> Thank you and best regards,
> Wenzel