[cfe-dev] LLVM/Clang optimization issue -- optimizer fails to discover common loop variable

Thu Jun 9 00:26:18 PDT 2016

Hi,

I’m having trouble getting LLVM/Clang to generate high quality code for a tight loop involving AVX intrinsics.
Consider the following simple function, which computes the element-wise sum of six input buffers and stores the result in a seventh.

#include <immintrin.h>

/*
 * Element-wise sum of six buffers of __m256 vectors:
 *
 *     a[i] = ((b[i] + c[i]) + (d[i] + e[i])) + (f[i] + g[i])   for 0 <= i < n
 *
 * n counts __m256 elements, not individual floats. Every pointer is
 * __restrict__-qualified, and all seven buffers are indexed by the same
 * loop variable i.
 */
void sum(size_t n, 
        __m256* __restrict__ a, __m256* __restrict__ b,
        __m256* __restrict__ c, __m256* __restrict__ d,
        __m256* __restrict__ e, __m256* __restrict__ f,
        __m256* __restrict__ g) {

    /* Three pairwise adds, then two adds to combine them; one store per iteration. */
    for (size_t i = 0; i < n; ++i)
        a[i] = _mm256_add_ps(
            _mm256_add_ps(_mm256_add_ps(b[i], c[i]),
                _mm256_add_ps(d[i], e[i])),
                _mm256_add_ps(f[i], g[i]));
}

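This is the main loop body generated for the above function (compiled with -mavx2 -O3 -fomit-frame-pointer -fno-unroll-loops) with LLVM version 7.3.0 (clang-703.0.31). Note the large number of “addq” instructions: each of the seven buffer pointers is incremented separately, in addition to the loop counter being decremented.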
The compiler also seems to assume that the __m256 entries are unaligned (it emits vmovups rather than vmovaps), but that is a separate issue.

# Loop body: each iteration handles one 32-byte vector from every buffer.
# Every buffer is addressed through its own pointer register with no index.
LBB0_2:
	# Three pairwise sums (load one operand, add the other from memory),
	# combined into %ymm0.
	vmovups	(%rdx), %ymm0
	vaddps	(%rcx), %ymm0, %ymm0
	vmovups	(%r8), %ymm1
	vaddps	(%r9), %ymm1, %ymm1
	vaddps	%ymm1, %ymm0, %ymm0
	vmovups	(%rax), %ymm1
	vaddps	(%r10), %ymm1, %ymm1
	vaddps	%ymm1, %ymm0, %ymm0
	# Store the result through the destination pointer.
	vmovups	%ymm0, (%rsi)
	# Seven independent pointer bumps of 32 bytes, one per buffer.
	addq	$32, %rdx
	addq	$32, %rcx
	addq	$32, %r8
	addq	$32, %r9
	addq	$32, %rax
	addq	$32, %r10
	addq	$32, %rsi
	# Separate down-counting trip counter; loop until it reaches zero.
	decq	%rdi
	jne	LBB0_2

For comparison, the following output was generated by GCC for the same function. It identifies the common counter variable and just stores the buffer offsets in registers.

# Loop body: all seven buffers are addressed as (base,%rax), i.e. a fixed
# base register plus the single shared byte offset held in %rax.
L5:
	vmovaps	(%r9,%rax), %ymm1
	vmovaps	(%rcx,%rax), %ymm0
	vaddps	(%r8,%rax), %ymm1, %ymm2
	vaddps	(%rdx,%rax), %ymm0, %ymm0
	vaddps	%ymm0, %ymm2, %ymm1
	vmovaps	(%r11,%rax), %ymm0
	vaddps	(%rbx,%rax), %ymm0, %ymm0
	vaddps	%ymm0, %ymm1, %ymm0
	vmovaps	%ymm0, (%rsi,%rax)
	# Only the shared offset is advanced (by 32 bytes); the base registers
	# are not modified inside the loop.
	addq	$32, %rax
	# Loop until the offset equals the limit in %r10
	# (presumably n * 32, set up before the loop; not shown here).
	cmpq	%rax, %r10
	jne	L5

Is there something that could be done to LLVM to generate better code in such cases?

For reference, this is the associated LLVM IR:

; Function Attrs: nounwind ssp uwtable
; IR for sum(): %a is the output buffer, %b..%g are the six read-only inputs.
; All seven pointers are marked noalias.
define void @_Z3summPDv8_fS0_S0_S0_S0_S0_S0_(i64 %n, <8 x float>* noalias nocapture %a, <8 x float>* noalias nocapture readonly %b, <8 x float>* noalias nocapture readonly %c, <8 x float>* noalias nocapture readonly %d, <8 x float>* noalias nocapture readonly %e, <8 x float>* noalias nocapture readonly %f, <8 x float>* noalias nocapture readonly %g) #0 {
  ; Guard: skip the loop entirely when n == 0.
  %1 = icmp eq i64 %n, 0
  br i1 %1, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ret void

.lr.ph:                                           ; preds = %0, %.lr.ph
  ; Single induction variable: 0 on entry, %20 (i + 1) on the back edge.
  %i.01 = phi i64 [ %20, %.lr.ph ], [ 0, %0 ]
  ; Every getelementptr below indexes its buffer with the same %i.01,
  ; so the shared loop index is explicit at the IR level.
  ; b[i] + c[i]
  %2 = getelementptr inbounds <8 x float>, <8 x float>* %b, i64 %i.01
  %3 = load <8 x float>, <8 x float>* %2, align 16, !tbaa !2
  %4 = getelementptr inbounds <8 x float>, <8 x float>* %c, i64 %i.01
  %5 = load <8 x float>, <8 x float>* %4, align 16, !tbaa !2
  %6 = fadd <8 x float> %3, %5
  ; d[i] + e[i], then added to the first partial sum
  %7 = getelementptr inbounds <8 x float>, <8 x float>* %d, i64 %i.01
  %8 = load <8 x float>, <8 x float>* %7, align 16, !tbaa !2
  %9 = getelementptr inbounds <8 x float>, <8 x float>* %e, i64 %i.01
  %10 = load <8 x float>, <8 x float>* %9, align 16, !tbaa !2
  %11 = fadd <8 x float> %8, %10
  %12 = fadd <8 x float> %6, %11
  ; f[i] + g[i], then added to the running sum
  %13 = getelementptr inbounds <8 x float>, <8 x float>* %f, i64 %i.01
  %14 = load <8 x float>, <8 x float>* %13, align 16, !tbaa !2
  %15 = getelementptr inbounds <8 x float>, <8 x float>* %g, i64 %i.01
  %16 = load <8 x float>, <8 x float>* %15, align 16, !tbaa !2
  %17 = fadd <8 x float> %14, %16
  %18 = fadd <8 x float> %12, %17
  ; a[i] = total
  %19 = getelementptr inbounds <8 x float>, <8 x float>* %a, i64 %i.01
  store <8 x float> %18, <8 x float>* %19, align 16, !tbaa !2
  ; i + 1; leave the loop once it equals n.
  %20 = add nuw i64 %i.01, 1
  %exitcond = icmp eq i64 %20, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
}

Thank you and best regards,
Wenzel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/cfe-dev/attachments/20160609/c86f8ce5/attachment.html>