[LLVMdev] loop vectorizer and storing to uniform addresses

Fri Nov 8 08:29:09 PST 2013

You did not run mem2reg before running the vectorizer. All of your variables are still in allocas. You need to run the standard LLVM optimization pipeline (or an approximation of it) before running the vectorizer.

On Nov 8, 2013, at 5:41 AM, Frank Winter <fwinter at jlab.org> wrote:

> I changed the input C to use a 64-bit type for the loop index (this eliminates the 'sext' instructions in the IR).
> 
> Here is the IR produced with clang -O0:
> 
> 
> define float @foo(i64 %start, i64 %end, float* %A) #0 {
> entry:
>   %start.addr = alloca i64, align 8
>   %end.addr = alloca i64, align 8
>   %A.addr = alloca float*, align 8
>   %sum = alloca [4 x float], align 16
>   %i = alloca i64, align 8
>   %q = alloca i64, align 8
>   store i64 %start, i64* %start.addr, align 8
>   store i64 %end, i64* %end.addr, align 8
>   store float* %A, float** %A.addr, align 8
>   %0 = bitcast [4 x float]* %sum to i8*
>   call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 16, i32 16, i1 false)
>   %1 = load i64* %start.addr, align 8
>   store i64 %1, i64* %i, align 8
>   br label %for.cond
> 
> for.cond:                                         ; preds = %for.inc6, %entry
>   %2 = load i64* %i, align 8
>   %3 = load i64* %end.addr, align 8
>   %cmp = icmp slt i64 %2, %3
>   br i1 %cmp, label %for.body, label %for.end8
> 
> for.body:                                         ; preds = %for.cond
>   store i64 0, i64* %q, align 8
>   br label %for.cond1
> 
> for.cond1:                                        ; preds = %for.inc, %for.body
>   %4 = load i64* %q, align 8
>   %cmp2 = icmp slt i64 %4, 4
>   br i1 %cmp2, label %for.body3, label %for.end
> 
> for.body3:                                        ; preds = %for.cond1
>   %5 = load i64* %i, align 8
>   %mul = mul nsw i64 %5, 4
>   %6 = load i64* %q, align 8
>   %add = add nsw i64 %mul, %6
>   %7 = load float** %A.addr, align 8
>   %arrayidx = getelementptr inbounds float* %7, i64 %add
>   %8 = load float* %arrayidx, align 4
>   %9 = load i64* %q, align 8
>   %arrayidx4 = getelementptr inbounds [4 x float]* %sum, i32 0, i64 %9
>   %10 = load float* %arrayidx4, align 4
>   %add5 = fadd float %10, %8
>   store float %add5, float* %arrayidx4, align 4
>   br label %for.inc
> 
> for.inc:                                          ; preds = %for.body3
>   %11 = load i64* %q, align 8
>   %inc = add nsw i64 %11, 1
>   store i64 %inc, i64* %q, align 8
>   br label %for.cond1
> 
> for.end:                                          ; preds = %for.cond1
>   br label %for.inc6
> 
> for.inc6:                                         ; preds = %for.end
>   %12 = load i64* %i, align 8
>   %inc7 = add nsw i64 %12, 1
>   store i64 %inc7, i64* %i, align 8
>   br label %for.cond
> 
> for.end8:                                         ; preds = %for.cond
>   %arrayidx9 = getelementptr inbounds [4 x float]* %sum, i32 0, i64 0
>   %13 = load float* %arrayidx9, align 4
>   %arrayidx10 = getelementptr inbounds [4 x float]* %sum, i32 0, i64 1
>   %14 = load float* %arrayidx10, align 4
>   %add11 = fadd float %13, %14
>   %arrayidx12 = getelementptr inbounds [4 x float]* %sum, i32 0, i64 2
>   %15 = load float* %arrayidx12, align 4
>   %add13 = fadd float %add11, %15
>   %arrayidx14 = getelementptr inbounds [4 x float]* %sum, i32 0, i64 3
>   %16 = load float* %arrayidx14, align 4
>   %add15 = fadd float %add13, %16
>   ret float %add15
> }
> 
> 
> 
> Thus, the inner loop is not unrolled.
> 
> opt -basicaa -loop-vectorize -debug-only=loop-vectorize -vectorizer-min-trip-count=4 -S sum.ll
> 
> LV: Checking a loop in "foo"
> LV: Found a loop: for.cond1
> LV: SCEV could not compute the loop exit count.
> LV: Not vectorizing.
> 
> opt -basicaa -gvn -loop-vectorize -debug-only=loop-vectorize -vectorizer-min-trip-count=4 -S sum.ll
> 
> LV: Checking a loop in "foo"
> LV: Found a loop: for.cond1
> LV: Found an induction variable.
> LV: We don't allow storing to uniform addresses
> LV: Can't vectorize due to memory conflicts
> LV: Not vectorizing.
> 
> 
> Frank
> 
> 
> 
> On 08/11/13 02:49, Renato Golin wrote:
>> On 7 November 2013 17:18, Frank Winter <fwinter at jlab.org> wrote:
>> LV: We don't allow storing to uniform addresses
>> 
>> This is triggering because it didn't recognize sum[q] as a reduction variable during canVectorizeInstrs(), but did recognize that sum[q] is loop invariant in canVectorizeMemory().
>> 
>> I'm guessing the nested loop was unrolled because of the low trip-count, and removed, so it ended up as:
>> 
>> float foo( int start , int end , float * A  )
>> {
>>   float sum[4] = {0.,0.,0.,0.};
>>   for (int i = start ; i < end ; ++i ) {
>>     sum[0] += A[i*4+0];
>>     sum[1] += A[i*4+1];
>>     sum[2] += A[i*4+2];
>>     sum[3] += A[i*4+3];
>>   }
>>   return sum[0]+sum[1]+sum[2]+sum[3];
>> }
>> 
>> but, for some reason, sum[q] wasn't recognized as a reduction variable, maybe because it was an array of reduction variables?
>> 
>> Having the IR would certainly help...
>> 
>> cheers,
>> --renato
> 
> 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20131108/e7a3a192/attachment.html>