[LLVMdev] loop vectorizer says Bad stride
Frank Winter
fwinter at jlab.org
Mon Oct 28 13:38:36 PDT 2013
Verifying function
running passes ...
LV: Checking a loop in "bar"
LV: Found a loop: L0
LV: Found an induction variable.
LV: We need to do 0 pointer comparisons.
LV: Checking memory dependencies
LV: Bad stride - Not an AddRecExpr pointer %13 = getelementptr float*
%arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to
i64)) + %arg2)
LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) + %arg2),+,4}<%L0>Sink
Scev: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) +
%arg2)(Induction step: 1)
LV: Distance for store float %11, float* %12 to store float %10,
float* %13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + {(-4
* (sext i32 %arg0 to i64)),+,-4}<%L0>)
Non-consecutive pointer access
LV: We don't need a runtime memory check.
LV: Can't vectorize due to memory conflicts
LV: Not vectorizing.
Here is the input IR:
; @bar (hand-written input): single-block loop over an i32 index that starts
; at %arg0. Each iteration performs
;   arg2[i]       = arg4[i]       + arg3[i]
;   arg2[i + 256] = arg4[i + 256] + arg3[i + 256]
; and then continues while (i + 1) < %arg1 (signed compare).
; All three pointer arguments are marked noalias.
;
; NOTE(review): entrypoint branches unconditionally into L0 and the exit test
; sits at the bottom of L0, so the body runs once even when %arg0 >= %arg1.
; A C `for (i = start; i < end; ++i)` loop tests before the first iteration.
;
; NOTE(review): the debug log quoted in this post names %12/%13 as the store
; addresses and shows `getelementptr float* %arg2, i32 %1`, whereas here
; %12/%13 are the fadd results and the addresses are %14/%15 with i64 indices.
; The log therefore appears to come from a slightly different numbering/form
; of this function - confirm before comparing line by line.
define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float*
noalias %arg3, float* noalias %arg4) {
entrypoint:
br label %L0
L0: ; preds = %L0, %entrypoint
; %0 is the loop index i: %arg0 on entry, %16 (= i + 1) on the back edge.
%0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ]
; %1 = i + 256, computed in i32 with nsw *before* being sign-extended (%5).
%1 = add nsw i32 %0, 256
; %2 = (i64) i, shared index for the three "[i]" accesses.
%2 = sext i32 %0 to i64
%3 = getelementptr float* %arg3, i64 %2
%4 = load float* %3, align 4
; %5 = (i64)(i + 256), shared index for the three "[i + 256]" accesses.
%5 = sext i32 %1 to i64
%6 = getelementptr float* %arg3, i64 %5
%7 = load float* %6, align 4
%8 = getelementptr float* %arg4, i64 %2
%9 = load float* %8, align 4
%10 = getelementptr float* %arg4, i64 %5
%11 = load float* %10, align 4
; %12 = arg4[i + 256] + arg3[i + 256]
%12 = fadd float %11, %7
; %13 = arg4[i] + arg3[i]
%13 = fadd float %9, %4
; arg2[i] = %13
%14 = getelementptr float* %arg2, i64 %2
store float %13, float* %14, align 4
; arg2[i + 256] = %12
%15 = getelementptr float* %arg2, i64 %5
store float %12, float* %15, align 4
; Advance the index and loop while the incremented value is < %arg1.
%16 = add nsw i32 %0, 1
%17 = icmp slt i32 %16, %arg1
br i1 %17, label %L0, label %L1
L1: ; preds = %L0
ret void
}
This function is IMO equivalent to
/*
 * For every i in [start, end): writes a[i] + b[i] to c[i] and
 * a[width + i] + b[width + i] to c[width + i], with width fixed at 256.
 * The loop condition is tested before the first iteration, so nothing is
 * written when start >= end.
 *
 * c, a and b are restrict-qualified.
 *
 * NOTE(review): the clang output quoted later in this post defines `@bar`
 * with parameters (c, a, b, start, end), so this listing's name (`main`)
 * and parameter order do not match the file that was actually compiled -
 * confirm against the original loop.c.
 */
void main(int start, int end, float * restrict c, float * restrict a,
float * restrict b)
{
const int width = 256;
for (int i = start ; i < end ; ++i ) {
/* first element-wise sum, at index i */
c[ i ] = a[ i ] + b[ i ];
/* second element-wise sum, offset by width (256) elements */
c[ width + i ] = a[ width + i ] + b[ width + i ];
}
}
With this version, the vectorizer doesn't complain about a bad stride and
can parallelize the loop.
Here is the output from "clang -emit-llvm -S loop.c", which can be parallelized:
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
; @bar (output of "clang -emit-llvm -S loop.c"): every parameter and local
; lives in an alloca slot and is reloaded from memory at each use.
; Control flow is entry -> for.cond -> for.body -> for.inc -> for.cond, with
; for.cond exiting to for.end; the i < end test therefore runs before the
; first execution of the body.
define void @bar(float* noalias %c, float* noalias %a, float* noalias
%b, i32 %start, i32 %end) #0 {
entry:
; Stack slots for the five parameters plus the locals `width` and `i`.
%c.addr = alloca float*, align 8
%a.addr = alloca float*, align 8
%b.addr = alloca float*, align 8
%start.addr = alloca i32, align 4
%end.addr = alloca i32, align 4
%width = alloca i32, align 4
%i = alloca i32, align 4
; Spill the incoming arguments to their slots.
store float* %c, float** %c.addr, align 8
store float* %a, float** %a.addr, align 8
store float* %b, float** %b.addr, align 8
store i32 %start, i32* %start.addr, align 4
store i32 %end, i32* %end.addr, align 4
; width = 256. The slot is never loaded again in this function; the adds in
; for.body use the literal 256 directly.
store i32 256, i32* %width, align 4
; i = start
%0 = load i32* %start.addr, align 4
store i32 %0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
; Loop test: continue into for.body while i < end (signed).
%1 = load i32* %i, align 4
%2 = load i32* %end.addr, align 4
%cmp = icmp slt i32 %1, %2
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
; %5 = a[i]
%3 = load i32* %i, align 4
%idxprom = sext i32 %3 to i64
%4 = load float** %a.addr, align 8
%arrayidx = getelementptr inbounds float* %4, i64 %idxprom
%5 = load float* %arrayidx, align 4
; %8 = b[i]
%6 = load i32* %i, align 4
%idxprom1 = sext i32 %6 to i64
%7 = load float** %b.addr, align 8
%arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1
%8 = load float* %arrayidx2, align 4
; c[i] = a[i] + b[i]
%add = fadd float %5, %8
%9 = load i32* %i, align 4
%idxprom3 = sext i32 %9 to i64
%10 = load float** %c.addr, align 8
%arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3
store float %add, float* %arrayidx4, align 4
; %13 = a[256 + i]; the i32 add (nsw) happens before the sign extension.
%11 = load i32* %i, align 4
%add5 = add nsw i32 256, %11
%idxprom6 = sext i32 %add5 to i64
%12 = load float** %a.addr, align 8
%arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6
%13 = load float* %arrayidx7, align 4
; %16 = b[256 + i]
%14 = load i32* %i, align 4
%add8 = add nsw i32 256, %14
%idxprom9 = sext i32 %add8 to i64
%15 = load float** %b.addr, align 8
%arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9
%16 = load float* %arrayidx10, align 4
; c[256 + i] = a[256 + i] + b[256 + i]
%add11 = fadd float %13, %16
%17 = load i32* %i, align 4
%add12 = add nsw i32 256, %17
%idxprom13 = sext i32 %add12 to i64
%18 = load float** %c.addr, align 8
%arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13
store float %add11, float* %arrayidx14, align 4
br label %for.inc
for.inc: ; preds = %for.body
; i = i + 1 (nsw), then re-evaluate the loop test.
%19 = load i32* %i, align 4
%inc = add nsw i32 %19, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false"
"no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"
"no-infs-fp-math"="false" "no-nans-fp-math"="false"
"stack-protector-buffer-size"="8" "unsafe-fp-math"="false"
"use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"}
Any ideas why the vectorizer doesn't like my code?
Frank
More information about the llvm-dev
mailing list