[llvm-commits] [LLVMdev] [PATCH] BasicBlock Autovectorization Pass

Mon Jan 23 20:13:40 PST 2012

On Tue, 2012-01-17 at 13:25 -0600, Sebastian Pop wrote:
> Hi,
> 
> On Fri, Dec 30, 2011 at 3:09 AM, Tobias Grosser <tobias at grosser.es> wrote:
> > As it seems my intuition is wrong, I am very eager to see and understand
> > an example where a search limit of 4000 is really needed.
> >
> 
> To make the ball roll again, I attached a testcase that can be tuned
> to understand the impact on compile time for different sizes of a
> basic block.  One can also set the number of iterations in the loop to
> 1 to test the vectorizer with no loops around.
> 
> Hal, could you please report the compile times with/without the
> vectorizer for different basic block sizes?

I've looked at your test case, and I am pleased to report a negligible
compile-time increase! Also, there is no vectorization of the main
loop :) Here's why: (as you know) the main part of the loop is
essentially one long dependency chain, and so there is nothing to
vectorize there. The only vectorization opportunities come from
unrolling the loop. Using the default thresholds, the loop will not even
partially unroll (because the body is too large). As a result,
essentially nothing happens.

I've prepared a reduced version of your test case (attached). Using
-unroll-threshold=300 (along with -unroll-allow-partial), I can make the
loop unroll partially (the reduced loop size is 110, so this allows
unrolling 2 iterations). Once this is done, the vectorizer finds
candidate pairs and vectorizes [as a practical manner, you need -basicaa
too].

I think that even this is probably too big for a regression test. I
don't think that the basic structure really adds anything over existing
tests (although I need to make sure that alias-analysis use is otherwise
covered), but I'll copy-and-paste a small portion into a regression test
to cover the search limit logic (which is currently uncovered). We
should probably discuss different situations that we'd like to see
covered in the regression suite (perhaps post-commit).

Thanks for working on this! I'll post an updated patch for review
shortly.

 -Hal

> 
> Once this parameter is tuned, could we get this code committed to llvm?
> 
> Thanks,
> Sebastian
> 
> PS: this testcase is also a compile time hog for GCC at -O3 when the
> loop vectorizer is running.
> 
> --
> Qualcomm Innovation Center, Inc is a member of Code Aurora Forum

-- 
Hal Finkel
Postdoctoral Appointee
Leadership Computing Facility
Argonne National Laboratory
-------------- next part --------------
; ModuleID = '/homes/hfinkel/tmp/bbvect.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; This test case is based on code by Sebastian Pop.

define i32 @main() nounwind uwtable readnone {
entry:
  %A = alloca [100 x i32], align 16
  %B = alloca [100 x i32], align 16
  %C = alloca [100 x i32], align 16
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv21247 = phi i64 [ 0, %entry ], [ %indvars.iv.next21248, %for.body ]
  %arrayidx = getelementptr inbounds [100 x i32]* %A, i64 0, i64 %indvars.iv21247
  %0 = trunc i64 %indvars.iv21247 to i32
  store i32 %0, i32* %arrayidx, align 4, !tbaa !0
  %1 = add nsw i64 %indvars.iv21247, 42
  %arrayidx2 = getelementptr inbounds [100 x i32]* %B, i64 0, i64 %indvars.iv21247
  %2 = trunc i64 %1 to i32
  store i32 %2, i32* %arrayidx2, align 4, !tbaa !0
  %3 = shl nsw i64 %indvars.iv21247, 1
  %arrayidx4 = getelementptr inbounds [100 x i32]* %C, i64 0, i64 %indvars.iv21247
  %4 = trunc i64 %3 to i32
  store i32 %4, i32* %arrayidx4, align 4, !tbaa !0
  %indvars.iv.next21248 = add i64 %indvars.iv21247, 1
  %lftr.wideiv21251 = trunc i64 %indvars.iv.next21248 to i32
  %exitcond21252 = icmp eq i32 %lftr.wideiv21251, 100
  br i1 %exitcond21252, label %do.body9, label %for.body

for.cond21230.preheader:                          ; preds = %do.body9
  %arrayidx21236 = getelementptr inbounds [100 x i32]* %B, i64 0, i64 99
  %arrayidx21234 = getelementptr inbounds [100 x i32]* %A, i64 0, i64 99
  %arrayidx21239 = getelementptr inbounds [100 x i32]* %C, i64 0, i64 99
  %5 = load i32* %arrayidx21236, align 4, !tbaa !0
  %6 = load i32* %arrayidx21234, align 4, !tbaa !0
  %add21237 = add i32 %5, %6
  %7 = load i32* %arrayidx21239, align 4, !tbaa !0
  %add21240 = add i32 %add21237, %7
  ret i32 %add21240

do.body9:                                         ; preds = %for.body, %do.body9
  %indvars.iv = phi i64 [ %indvars.iv.next, %do.body9 ], [ 0, %for.body ]
  %arrayidx11 = getelementptr inbounds [100 x i32]* %B, i64 0, i64 %indvars.iv
  %8 = load i32* %arrayidx11, align 4, !tbaa !0
  %arrayidx13 = getelementptr inbounds [100 x i32]* %C, i64 0, i64 %indvars.iv
  %9 = load i32* %arrayidx13, align 4, !tbaa !0
  %add14 = add nsw i32 %9, %8
  %arrayidx16 = getelementptr inbounds [100 x i32]* %A, i64 0, i64 %indvars.iv
  %mul21 = mul nsw i32 %add14, %9
  %sub = sub nsw i32 %add14, %mul21
  %mul41 = mul nsw i32 %add14, %sub
  %sub48 = sub nsw i32 %add14, %mul41
  %mul62 = mul nsw i32 %add14, %sub48
  %sub69 = sub nsw i32 %add14, %mul62
  %mul83 = mul nsw i32 %add14, %sub69
  %sub90 = sub nsw i32 %add14, %mul83
  %mul104 = mul nsw i32 %add14, %sub90
  %sub111 = sub nsw i32 %add14, %mul104
  %mul125 = mul nsw i32 %add14, %sub111
  %sub132 = sub nsw i32 %add14, %mul125
  %mul146 = mul nsw i32 %add14, %sub132
  %sub153 = sub nsw i32 %add14, %mul146
  %mul167 = mul nsw i32 %add14, %sub153
  %sub174 = sub nsw i32 %add14, %mul167
  %mul188 = mul nsw i32 %add14, %sub174
  %sub195 = sub nsw i32 %add14, %mul188
  %mul209 = mul nsw i32 %add14, %sub195
  %sub216 = sub nsw i32 %add14, %mul209
  %mul231 = mul nsw i32 %add14, %sub216
  %sub238 = sub nsw i32 %add14, %mul231
  %mul252 = mul nsw i32 %add14, %sub238
  %sub259 = sub nsw i32 %add14, %mul252
  %mul273 = mul nsw i32 %add14, %sub259
  %sub280 = sub nsw i32 %add14, %mul273
  %mul294 = mul nsw i32 %add14, %sub280
  %sub301 = sub nsw i32 %add14, %mul294
  %mul315 = mul nsw i32 %add14, %sub301
  %sub322 = sub nsw i32 %add14, %mul315
  %mul336 = mul nsw i32 %add14, %sub322
  %sub343 = sub nsw i32 %add14, %mul336
  %mul357 = mul nsw i32 %add14, %sub343
  %sub364 = sub nsw i32 %add14, %mul357
  %mul378 = mul nsw i32 %add14, %sub364
  %sub385 = sub nsw i32 %add14, %mul378
  %mul399 = mul nsw i32 %add14, %sub385
  %sub406 = sub nsw i32 %add14, %mul399
  %mul420 = mul nsw i32 %add14, %sub406
  %sub427 = sub nsw i32 %add14, %mul420
  %mul443 = mul nsw i32 %add14, %sub427
  %sub450 = sub nsw i32 %add14, %mul443
  %mul464 = mul nsw i32 %add14, %sub450
  %sub471 = sub nsw i32 %add14, %mul464
  %mul485 = mul nsw i32 %add14, %sub471
  %sub492 = sub nsw i32 %add14, %mul485
  %mul506 = mul nsw i32 %add14, %sub492
  %sub513 = sub nsw i32 %add14, %mul506
  %mul527 = mul nsw i32 %add14, %sub513
  %sub534 = sub nsw i32 %add14, %mul527
  %mul548 = mul nsw i32 %add14, %sub534
  %sub555 = sub nsw i32 %add14, %mul548
  %mul569 = mul nsw i32 %add14, %sub555
  %sub576 = sub nsw i32 %add14, %mul569
  %mul590 = mul nsw i32 %add14, %sub576
  %sub597 = sub nsw i32 %add14, %mul590
  %mul611 = mul nsw i32 %add14, %sub597
  %sub618 = sub nsw i32 %add14, %mul611
  %mul632 = mul nsw i32 %add14, %sub618
  %sub639 = sub nsw i32 %add14, %mul632
  %mul655 = mul nsw i32 %add14, %sub639
  %sub662 = sub nsw i32 %add14, %mul655
  %mul676 = mul nsw i32 %add14, %sub662
  %sub683 = sub nsw i32 %add14, %mul676
  %mul697 = mul nsw i32 %add14, %sub683
  %sub704 = sub nsw i32 %add14, %mul697
  %mul718 = mul nsw i32 %add14, %sub704
  %sub725 = sub nsw i32 %add14, %mul718
  %mul739 = mul nsw i32 %add14, %sub725
  %sub746 = sub nsw i32 %add14, %mul739
  %mul760 = mul nsw i32 %add14, %sub746
  %sub767 = sub nsw i32 %add14, %mul760
  %mul781 = mul nsw i32 %add14, %sub767
  %sub788 = sub nsw i32 %add14, %mul781
  %mul802 = mul nsw i32 %add14, %sub788
  %sub809 = sub nsw i32 %add14, %mul802
  %mul823 = mul nsw i32 %add14, %sub809
  %sub830 = sub nsw i32 %add14, %mul823
  %mul844 = mul nsw i32 %add14, %sub830
  %sub851 = sub nsw i32 %add14, %mul844
  %mul867 = mul nsw i32 %add14, %sub851
  %sub874 = sub nsw i32 %add14, %mul867
  %mul888 = mul nsw i32 %add14, %sub874
  %sub895 = sub nsw i32 %add14, %mul888
  %mul909 = mul nsw i32 %add14, %sub895
  %sub916 = sub nsw i32 %add14, %mul909
  %mul930 = mul nsw i32 %add14, %sub916
  %sub937 = sub nsw i32 %add14, %mul930
  %mul951 = mul nsw i32 %add14, %sub937
  %sub958 = sub nsw i32 %add14, %mul951
  %mul972 = mul nsw i32 %add14, %sub958
  %sub979 = sub nsw i32 %add14, %mul972
  %mul993 = mul nsw i32 %add14, %sub979
  %sub21179 = sub nsw i32 %add14, %mul993
  %mul21193 = mul nsw i32 %add14, %sub21179
  %sub21200 = sub nsw i32 %add14, %mul21193
  store i32 %add14, i32* %arrayidx16, align 4, !tbaa !0
  %mul21214 = mul nsw i32 %add14, %sub21200
  store i32 %mul21214, i32* %arrayidx11, align 4, !tbaa !0
  %sub21221 = sub nsw i32 %add14, %mul21214
  store i32 %sub21221, i32* %arrayidx13, align 4, !tbaa !0
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 100
  br i1 %exitcond, label %for.cond21230.preheader, label %do.body9
}

!0 = metadata !{metadata !"int", metadata !1}
!1 = metadata !{metadata !"omnipotent char", metadata !2}
!2 = metadata !{metadata !"Simple C/C++ TBAA", null}