[cfe-dev] LLVM/Clang optimization issue -- optimizer fails to discover common loop variable

Thu Jun 9 03:51:37 PDT 2016

Agreed, it looks like this is the result of a loop-variable optimization that made the generated code more complex. So what’s the best way to track this down?

Thanks,
Wenzel

> On 09 Jun 2016, at 12:00, mats petersson <mats at planetcatfish.com> wrote:
> 
> To be technical about the "lack of optimisation" here: the issue is not that LLVM fails to identify the common loop variable, but that it (probably misguidedly) removes the loop variable and uses multiple pointers instead.
> 
> You WOULD want:
> 
>     for(i = 0; i < size; i++)
>     {
>        s[i] = t[i];
>     }
> 
> to be converted to:
> 
>     stmp = s;
>     ttemp = t;
>     send = s+size;
>     while(stmp < send)
>     {
>        *stmp++ = *ttemp++;
>     }
> 
> right? So the compiler should recognise these cases, and do the conversion, but only when it's actually "better".
> 
> --
> Mats
> 
> On 9 June 2016 at 10:02, Wenzel Jakob via cfe-dev <cfe-dev at lists.llvm.org <mailto:cfe-dev at lists.llvm.org>> wrote:
> Apologies, there was a typo in my last message.
> 
> "The following output was generated by LLVM” in the third paragraph should have read "The following output was generated by GCC 5.3.0”.
> 
> -Wenzel
> 
>> On 09 Jun 2016, at 09:15, Wenzel Jakob <wenzel.jakob at epfl.ch <mailto:wenzel.jakob at epfl.ch>> wrote:
>> 
>> Hi,
>> 
>> I’m having trouble getting LLVM/Clang to generate high quality code for a tight loop involving AVX intrinsics.
>> Consider the following simple function, which computes a sum of a bunch of buffers.
>> 
>> #include <immintrin.h>
>> 
>> void sum(size_t n, 
>>         __m256* __restrict__ a, __m256* __restrict__ b,
>>         __m256* __restrict__ c, __m256* __restrict__ d,
>>         __m256* __restrict__ e, __m256* __restrict__ f,
>>         __m256* __restrict__ g) {
>> 
>>     for (size_t i = 0; i < n; ++i)
>>         a[i] = _mm256_add_ps(
>>             _mm256_add_ps(_mm256_add_ps(b[i], c[i]),
>>                 _mm256_add_ps(d[i], e[i])),
>>                 _mm256_add_ps(f[i], g[i]));
>> }
>> 
>> This is the main loop body resulting from the above expression (compiled with -mavx2 -O3 -fomit-frame-pointer -fno-unroll-loops). Note the large number of “addq” instructions!
>> The compiler also seems to assume that the __m256 entries are unaligned, but that is another issue.
>> 
>> LBB0_2:
>> 	vmovups	(%rdx), %ymm0
>> 	vaddps	(%rcx), %ymm0, %ymm0
>> 	vmovups	(%r8), %ymm1
>> 	vaddps	(%r9), %ymm1, %ymm1
>> 	vaddps	%ymm1, %ymm0, %ymm0
>> 	vmovups	(%rax), %ymm1
>> 	vaddps	(%r10), %ymm1, %ymm1
>> 	vaddps	%ymm1, %ymm0, %ymm0
>> 	vmovups	%ymm0, (%rsi)
>> 	addq	$32, %rdx
>> 	addq	$32, %rcx
>> 	addq	$32, %r8
>> 	addq	$32, %r9
>> 	addq	$32, %rax
>> 	addq	$32, %r10
>> 	addq	$32, %rsi
>> 	decq	%rdi
>> 	jne	LBB0_2
>> 
>> 
>> The following output was generated by LLVM. It identifies the common counter variable and just stores the buffer offsets in registers.
>> 
>> L5:
>> 	vmovaps	(%r9,%rax), %ymm1
>> 	vmovaps	(%rcx,%rax), %ymm0
>> 	vaddps	(%r8,%rax), %ymm1, %ymm2
>> 	vaddps	(%rdx,%rax), %ymm0, %ymm0
>> 	vaddps	%ymm0, %ymm2, %ymm1
>> 	vmovaps	(%r11,%rax), %ymm0
>> 	vaddps	(%rbx,%rax), %ymm0, %ymm0
>> 	vaddps	%ymm0, %ymm1, %ymm0
>> 	vmovaps	%ymm0, (%rsi,%rax)
>> 	addq	$32, %rax
>> 	cmpq	%rax, %r10
>> 	jne	L5
>> 
>> Is there something that could be done to LLVM to generate better code in such cases?
>> 
>> For reference, this is the associated LLVM IR:
>> 
>> ; Function Attrs: nounwind ssp uwtable
>> define void @_Z3summPDv8_fS0_S0_S0_S0_S0_S0_(i64 %n, <8 x float>* noalias nocapture %a, <8 x float>* noalias nocapture readonly %b, <8 x float>* noalias nocapture readonly %c, <8 x float>* noalias nocapture readonly %d, <8 x float>* noalias nocapture readonly %e, <8 x float>* noalias nocapture readonly %f, <8 x float>* noalias nocapture readonly %g) #0 {
>>   %1 = icmp eq i64 %n, 0
>>   br i1 %1, label %._crit_edge, label %.lr.ph
>> 
>> ._crit_edge:                                      ; preds = %.lr.ph, %0
>>   ret void
>> 
>> .lr.ph:                                           ; preds = %0, %.lr.ph
>>   %i.01 = phi i64 [ %20, %.lr.ph ], [ 0, %0 ]
>>   %2 = getelementptr inbounds <8 x float>, <8 x float>* %b, i64 %i.01
>>   %3 = load <8 x float>, <8 x float>* %2, align 16, !tbaa !2
>>   %4 = getelementptr inbounds <8 x float>, <8 x float>* %c, i64 %i.01
>>   %5 = load <8 x float>, <8 x float>* %4, align 16, !tbaa !2
>>   %6 = fadd <8 x float> %3, %5
>>   %7 = getelementptr inbounds <8 x float>, <8 x float>* %d, i64 %i.01
>>   %8 = load <8 x float>, <8 x float>* %7, align 16, !tbaa !2
>>   %9 = getelementptr inbounds <8 x float>, <8 x float>* %e, i64 %i.01
>>   %10 = load <8 x float>, <8 x float>* %9, align 16, !tbaa !2
>>   %11 = fadd <8 x float> %8, %10
>>   %12 = fadd <8 x float> %6, %11
>>   %13 = getelementptr inbounds <8 x float>, <8 x float>* %f, i64 %i.01
>>   %14 = load <8 x float>, <8 x float>* %13, align 16, !tbaa !2
>>   %15 = getelementptr inbounds <8 x float>, <8 x float>* %g, i64 %i.01
>>   %16 = load <8 x float>, <8 x float>* %15, align 16, !tbaa !2
>>   %17 = fadd <8 x float> %14, %16
>>   %18 = fadd <8 x float> %12, %17
>>   %19 = getelementptr inbounds <8 x float>, <8 x float>* %a, i64 %i.01
>>   store <8 x float> %18, <8 x float>* %19, align 16, !tbaa !2
>>   %20 = add nuw i64 %i.01, 1
>>   %exitcond = icmp eq i64 %20, %n
>>   br i1 %exitcond, label %._crit_edge, label %.lr.ph
>> }
>> 
>> Thank you and best regards,
>> Wenzel
> 
> 
> _______________________________________________
> cfe-dev mailing list
> cfe-dev at lists.llvm.org <mailto:cfe-dev at lists.llvm.org>
> http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev <http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev>
> 
> 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/cfe-dev/attachments/20160609/f751bc8e/attachment.html>