[cfe-dev] LLVM/Clang optimization issue -- optimizer fails to discover common loop variable
Wenzel Jakob via cfe-dev
cfe-dev at lists.llvm.org
Thu Jun 9 02:02:04 PDT 2016
Apologies, there was a typo in my last message.
"The following output was generated by LLVM” in the third paragraph should have read "The following output was generated by GCC 5.3.0”.
-Wenzel
> On 09 Jun 2016, at 09:15, Wenzel Jakob <wenzel.jakob at epfl.ch> wrote:
>
> Hi,
>
> I’m having trouble getting LLVM/Clang to generate high-quality code for a tight loop involving AVX intrinsics.
> Consider the following simple function, which computes the element-wise sum of six input buffers (b through g) into an output buffer (a).
>
> #include <immintrin.h>
> #include <stddef.h>  /* for size_t */
>
> void sum(size_t n,
>          __m256* __restrict__ a, __m256* __restrict__ b,
>          __m256* __restrict__ c, __m256* __restrict__ d,
>          __m256* __restrict__ e, __m256* __restrict__ f,
>          __m256* __restrict__ g) {
>
>     for (size_t i = 0; i < n; ++i)
>         a[i] = _mm256_add_ps(
>             _mm256_add_ps(_mm256_add_ps(b[i], c[i]),
>                           _mm256_add_ps(d[i], e[i])),
>             _mm256_add_ps(f[i], g[i]));
> }
>
> This is the main loop body Clang generates for the above code (compiled with -mavx2 -O3 -fomit-frame-pointer -fno-unroll-loops). Note the large number of “addq” instructions: the loop maintains a separate pointer induction variable for each of the seven buffers and increments all of them on every iteration.
> The compiler also seems to assume that the __m256 entries are unaligned (vmovups rather than vmovaps), but that is a separate issue.
>
> LBB0_2:
> vmovups (%rdx), %ymm0
> vaddps (%rcx), %ymm0, %ymm0
> vmovups (%r8), %ymm1
> vaddps (%r9), %ymm1, %ymm1
> vaddps %ymm1, %ymm0, %ymm0
> vmovups (%rax), %ymm1
> vaddps (%r10), %ymm1, %ymm1
> vaddps %ymm1, %ymm0, %ymm0
> vmovups %ymm0, (%rsi)
> addq $32, %rdx
> addq $32, %rcx
> addq $32, %r8
> addq $32, %r9
> addq $32, %rax
> addq $32, %r10
> addq $32, %rsi
> decq %rdi
> jne LBB0_2
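>
> As an aside on the alignment issue: the sketch below is one way I could imagine spelling out the 32-byte alignment at the source level, by taking float pointers and using the explicitly aligned load/store intrinsics. The function name is only illustrative, and I have not verified that this changes the generated code; it is just meant to show the idea (the IR further down shows the plain __m256 loads coming through with “align 16”).
>
> #include <immintrin.h>
> #include <stddef.h>
>
> /* Sketch only: same computation, but over float pointers with the aligned
>    load/store intrinsics, which require (and communicate) 32-byte alignment.
>    n is still the number of __m256 elements per buffer. */
> void sum_aligned(size_t n,
>                  float* __restrict__ a, const float* __restrict__ b,
>                  const float* __restrict__ c, const float* __restrict__ d,
>                  const float* __restrict__ e, const float* __restrict__ f,
>                  const float* __restrict__ g) {
>
>     for (size_t i = 0; i < 8 * n; i += 8)
>         _mm256_store_ps(a + i, _mm256_add_ps(
>             _mm256_add_ps(_mm256_add_ps(_mm256_load_ps(b + i),
>                                         _mm256_load_ps(c + i)),
>                           _mm256_add_ps(_mm256_load_ps(d + i),
>                                         _mm256_load_ps(e + i))),
>             _mm256_add_ps(_mm256_load_ps(f + i),
>                           _mm256_load_ps(g + i))));
> }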
>
>
> The following output was generated by GCC 5.3.0 (see the correction above; this sentence originally said “LLVM”). It identifies the common counter variable, keeps the buffer base addresses in registers, and advances only a single offset (%rax) per iteration.
>
> L5:
> vmovaps (%r9,%rax), %ymm1
> vmovaps (%rcx,%rax), %ymm0
> vaddps (%r8,%rax), %ymm1, %ymm2
> vaddps (%rdx,%rax), %ymm0, %ymm0
> vaddps %ymm0, %ymm2, %ymm1
> vmovaps (%r11,%rax), %ymm0
> vaddps (%rbx,%rax), %ymm0, %ymm0
> vaddps %ymm0, %ymm1, %ymm0
> vmovaps %ymm0, (%rsi,%rax)
> addq $32, %rax
> cmpq %rax, %r10
> jne L5
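>
> For comparison, this is roughly what GCC’s addressing scheme looks like when written out in source form: one shared running byte offset, with the seven base pointers left untouched. The function name is illustrative and I have not checked whether writing the loop this way actually changes what Clang emits; it is only meant to make the single-induction-variable structure explicit.
>
> #include <immintrin.h>
> #include <stddef.h>
>
> /* Sketch only: the shared byte offset 'off' plays the role of %rax in the
>    GCC listing above; the seven base pointers stay fixed for the whole loop. */
> void sum_single_offset(size_t n,
>                        __m256* __restrict__ a, __m256* __restrict__ b,
>                        __m256* __restrict__ c, __m256* __restrict__ d,
>                        __m256* __restrict__ e, __m256* __restrict__ f,
>                        __m256* __restrict__ g) {
>
>     for (size_t off = 0; off < n * sizeof(__m256); off += sizeof(__m256)) {
>         __m256 vb = _mm256_loadu_ps((const float *) ((const char *) b + off));
>         __m256 vc = _mm256_loadu_ps((const float *) ((const char *) c + off));
>         __m256 vd = _mm256_loadu_ps((const float *) ((const char *) d + off));
>         __m256 ve = _mm256_loadu_ps((const float *) ((const char *) e + off));
>         __m256 vf = _mm256_loadu_ps((const float *) ((const char *) f + off));
>         __m256 vg = _mm256_loadu_ps((const float *) ((const char *) g + off));
>         __m256 s  = _mm256_add_ps(
>             _mm256_add_ps(_mm256_add_ps(vb, vc), _mm256_add_ps(vd, ve)),
>             _mm256_add_ps(vf, vg));
>         _mm256_storeu_ps((float *) ((char *) a + off), s);
>     }
> }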
>
> Is there something that could be done in LLVM to generate better code in such cases?
>
> For reference, this is the associated LLVM IR:
>
> ; Function Attrs: nounwind ssp uwtable
> define void @_Z3summPDv8_fS0_S0_S0_S0_S0_S0_(i64 %n, <8 x float>* noalias nocapture %a, <8 x float>* noalias nocapture readonly %b, <8 x float>* noalias nocapture readonly %c, <8 x float>* noalias nocapture readonly %d, <8 x float>* noalias nocapture readonly %e, <8 x float>* noalias nocapture readonly %f, <8 x float>* noalias nocapture readonly %g) #0 {
> %1 = icmp eq i64 %n, 0
> br i1 %1, label %._crit_edge, label %.lr.ph
>
> ._crit_edge: ; preds = %.lr.ph, %0
> ret void
>
> .lr.ph: ; preds = %0, %.lr.ph
> %i.01 = phi i64 [ %20, %.lr.ph ], [ 0, %0 ]
> %2 = getelementptr inbounds <8 x float>, <8 x float>* %b, i64 %i.01
> %3 = load <8 x float>, <8 x float>* %2, align 16, !tbaa !2
> %4 = getelementptr inbounds <8 x float>, <8 x float>* %c, i64 %i.01
> %5 = load <8 x float>, <8 x float>* %4, align 16, !tbaa !2
> %6 = fadd <8 x float> %3, %5
> %7 = getelementptr inbounds <8 x float>, <8 x float>* %d, i64 %i.01
> %8 = load <8 x float>, <8 x float>* %7, align 16, !tbaa !2
> %9 = getelementptr inbounds <8 x float>, <8 x float>* %e, i64 %i.01
> %10 = load <8 x float>, <8 x float>* %9, align 16, !tbaa !2
> %11 = fadd <8 x float> %8, %10
> %12 = fadd <8 x float> %6, %11
> %13 = getelementptr inbounds <8 x float>, <8 x float>* %f, i64 %i.01
> %14 = load <8 x float>, <8 x float>* %13, align 16, !tbaa !2
> %15 = getelementptr inbounds <8 x float>, <8 x float>* %g, i64 %i.01
> %16 = load <8 x float>, <8 x float>* %15, align 16, !tbaa !2
> %17 = fadd <8 x float> %14, %16
> %18 = fadd <8 x float> %12, %17
> %19 = getelementptr inbounds <8 x float>, <8 x float>* %a, i64 %i.01
> store <8 x float> %18, <8 x float>* %19, align 16, !tbaa !2
> %20 = add nuw i64 %i.01, 1
> %exitcond = icmp eq i64 %20, %n
> br i1 %exitcond, label %._crit_edge, label %.lr.ph
> }
>
> Thank you and best regards,
> Wenzel