[LLVMdev] [llvm] r184698 - Add a flag to defer vectorization into a phase after the inliner and its

Wed Jun 26 09:14:30 PDT 2013

On Jun 26, 2013, at 10:14 AM, Andrew Trick <atrick at apple.com> wrote:
> On Jun 25, 2013, at 8:14 AM, Hal Finkel <hfinkel at anl.gov> wrote:
> 
>> ----- Original Message -----
> SE "drops" flags internally because we don't have a safe way to express them. But why is Indvars dropping the IR flags? We should:
> 1) determine if it's safe for Indvars to preserve nsw in the cases we miss now
> 2) if not, we may want a loop-level analysis to preserve certain facts like trip count (hopefully not needed)
> 3) defer parts of Indvars that are destructive or even reorganize loop passes. E.g., linear function test replace could run as late as LSR.

So that I remember this when I come back to work on it, let me give an example of why just moving loop-vectorize would currently cause us to miss some opportunities (unless we fix things first). Note that I am not saying that this is not fixable:
For example, we could do some additional analysis on the accesses: we do know the loop bounds in the example below, so the loop vectorizer can just ask SCEV whether “2*60+1 < max_i32” holds. In the case where we don’t statically know that this is true, we could emit runtime checks. Maybe we can also fix the widening of induction variables that IndVars performs, which is what drops the flags.

I just want to point out that when we do our experiments now - without fixing this - this is a source of potential lost opportunities.

Having said that, here is an example of the issue that we will run into right now if we just move the vectorizer to a later point:

struct {
  int A[128];
  int B[128];
} S;

void test() {
  for (int i = 0; i < 60; ++i) {
    S.A[2*i] = S.A[2*i+1];
  }
}

Currently, if we run “clang -O3 -debug-only=loop-vectorize”, the cached SCEV info that we get tells us that the two accesses don’t wrap (the vectorizer needs this information to know it is safe to perform vectorization):

LV: Src Scev: {(4 + @S),+,8}<nsw><%for.body>Sink Scev: {@S,+,8}<nsw><%for.body>(Induction step: 2)

However the IR coming into the vectorizer looks somewhat like the following:

===
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
%struct.anon = type { [128 x i32], [128 x i32] }

@S = common global %struct.anon zeroinitializer, align 4

; Function Attrs: nounwind ssp uwtable
define void @test() #0 {
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %0 = shl nsw i64 %indvars.iv, 1
  %1 = or i64 %0, 1
  %arrayidx = getelementptr inbounds %struct.anon* @S, i64 0, i32 0, i64 %1
  %2 = load i32* %arrayidx, align 4, !tbaa !0
  %arrayidx3 = getelementptr inbounds %struct.anon* @S, i64 0, i32 0, i64 %0
  store i32 %2, i32* %arrayidx3, align 4, !tbaa !0
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 60
  br i1 %exitcond, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}

attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }

!0 = metadata !{metadata !"int", metadata !1}
!1 = metadata !{metadata !"omnipotent char", metadata !2}
!2 = metadata !{metadata !"Simple C/C++ TBAA"}
===

If we run “opt -loop-vectorize < IR.ll”, we will not vectorize the loop, because SCEV can no longer deduce that the accesses don’t wrap:

LV: {(4 + @S),+,8}<%for.body>Sink Scev: {@S,+,8}<%for.body>(Induction step: 0)

The point where we lose the flag is in IndVars, when we widen the type from i32 to i64:

*** IR Dump After Loop-Closed SSA Form Pass ***
for.body:                                         ; preds = %entry, %for.body
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %mul = shl nsw i32 %i.08, 1
  %add7 = or i32 %mul, 1
  %idxprom = sext i32 %add7 to i64
  %arrayidx = getelementptr inbounds %struct.anon* @S, i64 0, i32 0, i64 %idxprom
  %0 = load i32* %arrayidx, align 4, !tbaa !0
  %idxprom2 = sext i32 %mul to i64
  %arrayidx3 = getelementptr inbounds %struct.anon* @S, i64 0, i32 0, i64 %idxprom2
  store i32 %0, i32* %arrayidx3, align 4, !tbaa !0
  %inc = add nsw i32 %i.08, 1   <<< // Flag present.
  %cmp = icmp slt i32 %inc, 60
  br i1 %cmp, label %for.body, label %for.end

Dump just before LFTR:

define void @test(i32 %N) #0 {
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %0 = shl nsw i64 %indvars.iv, 1
  %mul = shl nsw i32 %i.08, 1
  %1 = or i64 %0, 1
  %add7 = or i32 %mul, 1
  %idxprom = sext i32 %add7 to i64
  %arrayidx = getelementptr inbounds %struct.anon* @S, i64 0, i32 0, i64 %1
  %2 = load i32* %arrayidx, align 4, !tbaa !0
  %idxprom2 = sext i32 %mul to i64
  %arrayidx3 = getelementptr inbounds %struct.anon* @S, i64 0, i32 0, i64 %0
  store i32 %2, i32* %arrayidx3, align 4, !tbaa !0
  %indvars.iv.next = add i64 %indvars.iv, 1  <<< // LOST
  %inc = add nsw i32 %i.08, 1
  %3 = trunc i64 %indvars.iv.next to i32
  %cmp = icmp slt i32 %3, 60
  br i1 %cmp, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}

*** IR Dump After Induction Variable Simplification ***
for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %0 = shl nsw i64 %indvars.iv, 1
  %1 = or i64 %0, 1
  %arrayidx = getelementptr inbounds %struct.anon* @S, i64 0, i32 0, i64 %1
  %2 = load i32* %arrayidx, align 4, !tbaa !0
  %arrayidx3 = getelementptr inbounds %struct.anon* @S, i64 0, i32 0, i64 %0
  store i32 %2, i32* %arrayidx3, align 4, !tbaa !0
  %indvars.iv.next = add i64 %indvars.iv, 1 <<< // HERE
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 60
  br i1 %exitcond, label %for.body, label %for.end