[LLVMdev] loop vectorizer says Bad stride

Mon Oct 28 16:31:33 PDT 2013

Hi Nadav,

Right! The sign-extend was the problem. Hmm... Is this a bug or a feature?

Frank

On 28/10/13 16:58, Nadav Rotem wrote:
> Frank,
>
> It looks like the loop vectorizer is unable to tell that the two stores in your code never overlap. This is probably because of the sign-extend in your code. Can you extend the indices to 64 bits?
>
> Thanks,
> Nadav
>
> On Oct 28, 2013, at 1:38 PM, Frank Winter <fwinter at jlab.org> wrote:
>
>> Verifying function
>> running passes ...
>> LV: Checking a loop in "bar"
>> LV: Found a loop: L0
>> LV: Found an induction variable.
>> LV: We need to do 0 pointer comparisons.
>> LV: Checking memory dependencies
>> LV: Bad stride - Not an AddRecExpr pointer   %13 = getelementptr float* %arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2)
>> LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) + %arg2),+,4}<%L0>Sink Scev: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2)(Induction step: 1)
>> LV: Distance for   store float %11, float* %12 to   store float %10, float* %13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + {(-4 * (sext i32 %arg0 to i64)),+,-4}<%L0>)
>> Non-consecutive pointer access
>> LV: We don't need a runtime memory check.
>> LV: Can't vectorize due to memory conflicts
>> LV: Not vectorizing.
>>
>> Here the input IR:
>>
>> define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float* noalias %arg3, float* noalias %arg4) {
>> entrypoint:
>>   br label %L0
>>
>> L0:                                               ; preds = %L0, %entrypoint
>>   %0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ]
>>   %1 = add nsw i32 %0, 256
>>   %2 = sext i32 %0 to i64
>>   %3 = getelementptr float* %arg3, i64 %2
>>   %4 = load float* %3, align 4
>>   %5 = sext i32 %1 to i64
>>   %6 = getelementptr float* %arg3, i64 %5
>>   %7 = load float* %6, align 4
>>   %8 = getelementptr float* %arg4, i64 %2
>>   %9 = load float* %8, align 4
>>   %10 = getelementptr float* %arg4, i64 %5
>>   %11 = load float* %10, align 4
>>   %12 = fadd float %11, %7
>>   %13 = fadd float %9, %4
>>   %14 = getelementptr float* %arg2, i64 %2
>>   store float %13, float* %14, align 4
>>   %15 = getelementptr float* %arg2, i64 %5
>>   store float %12, float* %15, align 4
>>   %16 = add nsw i32 %0, 1
>>   %17 = icmp slt i32 %16, %arg1
>>   br i1 %17, label %L0, label %L1
>>
>> L1:                                               ; preds = %L0
>>   ret void
>> }
>>
>> This function is IMO equivalent to
>>
>> void main(int start, int end, float * restrict c, float * restrict a, float * restrict b)
>> {
>>   const int width = 256;
>>   for (int i = start ; i < end ; ++i ) {
>>     c[ i ]         = a[ i ]         + b[ i ];
>>     c[ width + i ] = a[ width + i ] + b[ width + i ];
>>   }
>> }
>>
>> With this version, the vectorizer doesn't complain about a bad stride and can parallelize the loop.
>>
>> Here the output from "clang -emit-llvm -S loop.c" which can be parallelized:
>>
>>
>>
>> target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
>> target triple = "x86_64-unknown-linux-gnu"
>>
>> ; Function Attrs: nounwind uwtable
>> define void @bar(float* noalias %c, float* noalias %a, float* noalias %b, i32 %start, i32 %end) #0 {
>> entry:
>>   %c.addr = alloca float*, align 8
>>   %a.addr = alloca float*, align 8
>>   %b.addr = alloca float*, align 8
>>   %start.addr = alloca i32, align 4
>>   %end.addr = alloca i32, align 4
>>   %width = alloca i32, align 4
>>   %i = alloca i32, align 4
>>   store float* %c, float** %c.addr, align 8
>>   store float* %a, float** %a.addr, align 8
>>   store float* %b, float** %b.addr, align 8
>>   store i32 %start, i32* %start.addr, align 4
>>   store i32 %end, i32* %end.addr, align 4
>>   store i32 256, i32* %width, align 4
>>   %0 = load i32* %start.addr, align 4
>>   store i32 %0, i32* %i, align 4
>>   br label %for.cond
>>
>> for.cond:                                         ; preds = %for.inc, %entry
>>   %1 = load i32* %i, align 4
>>   %2 = load i32* %end.addr, align 4
>>   %cmp = icmp slt i32 %1, %2
>>   br i1 %cmp, label %for.body, label %for.end
>>
>> for.body:                                         ; preds = %for.cond
>>   %3 = load i32* %i, align 4
>>   %idxprom = sext i32 %3 to i64
>>   %4 = load float** %a.addr, align 8
>>   %arrayidx = getelementptr inbounds float* %4, i64 %idxprom
>>   %5 = load float* %arrayidx, align 4
>>   %6 = load i32* %i, align 4
>>   %idxprom1 = sext i32 %6 to i64
>>   %7 = load float** %b.addr, align 8
>>   %arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1
>>   %8 = load float* %arrayidx2, align 4
>>   %add = fadd float %5, %8
>>   %9 = load i32* %i, align 4
>>   %idxprom3 = sext i32 %9 to i64
>>   %10 = load float** %c.addr, align 8
>>   %arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3
>>   store float %add, float* %arrayidx4, align 4
>>   %11 = load i32* %i, align 4
>>   %add5 = add nsw i32 256, %11
>>   %idxprom6 = sext i32 %add5 to i64
>>   %12 = load float** %a.addr, align 8
>>   %arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6
>>   %13 = load float* %arrayidx7, align 4
>>   %14 = load i32* %i, align 4
>>   %add8 = add nsw i32 256, %14
>>   %idxprom9 = sext i32 %add8 to i64
>>   %15 = load float** %b.addr, align 8
>>   %arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9
>>   %16 = load float* %arrayidx10, align 4
>>   %add11 = fadd float %13, %16
>>   %17 = load i32* %i, align 4
>>   %add12 = add nsw i32 256, %17
>>   %idxprom13 = sext i32 %add12 to i64
>>   %18 = load float** %c.addr, align 8
>>   %arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13
>>   store float %add11, float* %arrayidx14, align 4
>>   br label %for.inc
>>
>> for.inc:                                          ; preds = %for.body
>>   %19 = load i32* %i, align 4
>>   %inc = add nsw i32 %19, 1
>>   store i32 %inc, i32* %i, align 4
>>   br label %for.cond
>>
>> for.end:                                          ; preds = %for.cond
>>   ret void
>> }
>>
>> attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
>>
>> !llvm.ident = !{!0}
>>
>> !0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"}
>>
>>
>> Any ideas why the vectorizer doesn't like my code?
>>
>> Frank
>>
>>
>>
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev