[LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass

Hal Finkel hfinkel at anl.gov
Thu Jan 26 15:20:49 PST 2012


On Thu, 2012-01-26 at 15:49 -0600, Sebastian Pop wrote:
> On Thu, Jan 26, 2012 at 3:41 PM, Hal Finkel <hfinkel at anl.gov> wrote:
> > On Thu, 2012-01-26 at 15:36 -0600, Sebastian Pop wrote:
> >> arm-none-linux-gnueabi

For what cpu are you compiling?

I think this may be a case where not having information on exactly what
can be vectorized on the backend may be hurting us. The LLVM output looks
okay (attached), but it may be that the post-legalization optimizations
are just not good enough to undo the damage done by an unfortunate
selection of instructions to vectorize. The options available in the
pass currently are fairly coarse, but please try setting them as
appropriate for your cpu and see if that makes a difference:

-bb-vectorize-aligned-only - Only generate aligned loads and stores
-bb-vectorize-no-casts - Don't try to vectorize casting (conversion)
operations
-bb-vectorize-no-floats - Don't try to vectorize floating-point values
-bb-vectorize-no-fma - Don't try to vectorize the fused-multiply-add
intrinsic
-bb-vectorize-no-ints - Don't try to vectorize integer values
-bb-vectorize-no-math - Don't try to vectorize floating-point math
intrinsics
-bb-vectorize-no-mem-ops - Don't try to vectorize loads and stores
-bb-vectorize-vector-bits=<uint> - The size of the native vector
registers (128 is the default)

> >
> > Indeed, adding -ccc-host-triple arm-none-linux-gnueabi I also get
> 
> Minor remark: please use -target instead of -ccc-host-triple that is
> now deprecated.

Correct, thanks!

 -Hal

> 
> Thanks for looking at this testcase.
> Sebastian
> --
> Qualcomm Innovation Center, Inc is a member of Code Aurora Forum

-- 
Hal Finkel
Postdoctoral Appointee
Leadership Computing Facility
Argonne National Laboratory
-------------- next part --------------
; ModuleID = 'test-s-20120126.c'
; Module preamble for the attachment whose kernel loop in @main contains only
; scalar operations (presumably the input to the bb-vectorize pass -- confirm).
;
; Data layout: the leading "e" selects little-endian and "p:32:32:32" gives
; 32-bit pointers (see the LLVM Language Reference for the remaining fields).
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
; NOTE(review): the triple names armv4t; presumably that target has no vector
; unit, which bears on the "for what cpu" question in the message -- confirm.
target triple = "armv4t-none-linux-gnueabi"

; Layout of the C library FILE object. In this module it is only referenced
; through pointers (@stdout and the first parameter of @fprintf).
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
; Two i32 fields; @main reads field 0 as %tv_sec and field 1 as %tv_usec.
%struct.timeval = type { i32, i32 }
; Only ever passed to @gettimeofday as a null pointer in this module.
%struct.timezone = type { i32, i32 }

; External stream handle that @main loads and hands to @fprintf.
@stdout = external global %struct._IO_FILE*
; Format string for the timing report: 35 bytes including "\0A" and the NUL.
@.str = private unnamed_addr constant [35 x i8] c"kernel execution time: %18.9f sec\0A\00", align 1

; @main (scalar form): times a byte-wise 3-input / 3-output integer kernel and
; prints the elapsed time.
;   1. Allocates two 30000-byte buffers with malloc (the results are used
;      without a null check).
;   2. Fills the first buffer with the low 8 bits of the index 0..29999.
;   3. Between two gettimeofday calls, runs the kernel loop nest
;      (10000 outer x 10000 inner iterations).
;   4. Prints the elapsed seconds with fprintf to stdout and returns the sum of
;      output bytes 12 and 9988 (zero-extended).
define i32 @main() nounwind {
entry:
  %start = alloca %struct.timeval, align 4
  %end = alloca %struct.timeval, align 4
  ; %call is the input buffer, %call1 the output buffer.
  %call = call noalias i8* @malloc(i32 30000) nounwind
  %call1 = call noalias i8* @malloc(i32 30000) nounwind
  br label %for.body

; Initialization loop: *x++ = (i8)i for i = 0 .. 29999.
for.body:                                         ; preds = %for.body, %entry
  %i.068 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %x.067 = phi i8* [ %call, %entry ], [ %incdec.ptr, %for.body ]
  %conv = trunc i32 %i.068 to i8
  %incdec.ptr = getelementptr inbounds i8* %x.067, i32 1
  store i8 %conv, i8* %x.067, align 1, !tbaa !0
  %inc = add nsw i32 %i.068, 1
  %exitcond70 = icmp eq i32 %inc, 30000
  br i1 %exitcond70, label %for.end, label %for.body

; Record the start timestamp.
for.end:                                          ; preds = %for.body
  %call2 = call i32 @gettimeofday(%struct.timeval* %start, %struct.timezone* null) nounwind
  br label %for.cond7.preheader

; Outer loop header: k counts 0 .. 9999. Each outer iteration restarts the
; inner loop at the beginning of both buffers (see the phis in for.body10).
for.cond7.preheader:                              ; preds = %for.inc45, %for.end
  %k.066 = phi i32 [ 0, %for.end ], [ %inc46, %for.inc45 ]
  br label %for.body10

; Inner kernel loop, 10000 iterations. Each iteration reads three input bytes
; r[0..2] and writes three output bytes w[0..2], then advances both pointers
; by 3:
;   w[0] = (i8)(r0*123 + r1*321 + r2*567)
;   w[1] = (i8)(r0*234 + r1*432 + r2*987)
;   w[2] = (i8)(r0*345 + r1*543 + r2*789)
; where r0..r2 are the input bytes zero-extended to i32.
for.body10:                                       ; preds = %for.body10, %for.cond7.preheader
  %w.065 = phi i8* [ %call1, %for.cond7.preheader ], [ %incdec.ptr41, %for.body10 ]
  %i.164 = phi i32 [ 0, %for.cond7.preheader ], [ %inc43, %for.body10 ]
  %r.063 = phi i8* [ %call, %for.cond7.preheader ], [ %incdec.ptr13, %for.body10 ]
  ; Load r[0], r[1], r[2]; %incdec.ptr13 (r + 3) feeds the next iteration.
  %incdec.ptr11 = getelementptr inbounds i8* %r.063, i32 1
  %0 = load i8* %r.063, align 1, !tbaa !0
  %incdec.ptr12 = getelementptr inbounds i8* %r.063, i32 2
  %1 = load i8* %incdec.ptr11, align 1, !tbaa !0
  %incdec.ptr13 = getelementptr inbounds i8* %r.063, i32 3
  %2 = load i8* %incdec.ptr12, align 1, !tbaa !0
  ; First output byte, stored to w[0].
  %conv14 = zext i8 %0 to i32
  %mul = mul nsw i32 %conv14, 123
  %conv15 = zext i8 %1 to i32
  %mul16 = mul nsw i32 %conv15, 321
  %conv17 = zext i8 %2 to i32
  %mul18 = mul nsw i32 %conv17, 567
  %add = add i32 %mul16, %mul
  %add19 = add i32 %add, %mul18
  %conv20 = trunc i32 %add19 to i8
  %incdec.ptr21 = getelementptr inbounds i8* %w.065, i32 1
  store i8 %conv20, i8* %w.065, align 1, !tbaa !0
  ; Second output byte, stored to w[1]; reuses the zero-extended inputs.
  %mul23 = mul nsw i32 %conv14, 234
  %mul25 = mul nsw i32 %conv15, 432
  %mul28 = mul nsw i32 %conv17, 987
  %add26 = add i32 %mul25, %mul23
  %add29 = add i32 %add26, %mul28
  %conv30 = trunc i32 %add29 to i8
  %incdec.ptr31 = getelementptr inbounds i8* %w.065, i32 2
  store i8 %conv30, i8* %incdec.ptr21, align 1, !tbaa !0
  ; Third output byte, stored to w[2]; %incdec.ptr41 (w + 3) feeds the next
  ; iteration.
  %mul33 = mul nsw i32 %conv14, 345
  %mul35 = mul nsw i32 %conv15, 543
  %mul38 = mul nsw i32 %conv17, 789
  %add36 = add i32 %mul35, %mul33
  %add39 = add i32 %add36, %mul38
  %conv40 = trunc i32 %add39 to i8
  %incdec.ptr41 = getelementptr inbounds i8* %w.065, i32 3
  store i8 %conv40, i8* %incdec.ptr31, align 1, !tbaa !0
  %inc43 = add nsw i32 %i.164, 1
  %exitcond = icmp eq i32 %inc43, 10000
  br i1 %exitcond, label %for.inc45, label %for.body10

; Outer loop latch: exit after 10000 passes over the buffers.
for.inc45:                                        ; preds = %for.body10
  %inc46 = add nsw i32 %k.066, 1
  %exitcond69 = icmp eq i32 %inc46, 10000
  br i1 %exitcond69, label %for.end47, label %for.cond7.preheader

; Record the end timestamp, report the elapsed time, and compute the result.
for.end47:                                        ; preds = %for.inc45
  %call48 = call i32 @gettimeofday(%struct.timeval* %end, %struct.timezone* null) nounwind
  ; %conv51 = (end field 0 - start field 0) * 1000000, sign-extended to i64.
  %tv_sec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 0
  %3 = load i32* %tv_sec, align 4, !tbaa !2
  %tv_sec49 = getelementptr inbounds %struct.timeval* %start, i32 0, i32 0
  %4 = load i32* %tv_sec49, align 4, !tbaa !2
  %sub = sub nsw i32 %3, %4
  %mul50 = mul nsw i32 %sub, 1000000
  %conv51 = sext i32 %mul50 to i64
  ; %conv55 = (end field 1 - start field 1), sign-extended to i64.
  %tv_usec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 1
  %5 = load i32* %tv_usec, align 4, !tbaa !2
  %tv_usec53 = getelementptr inbounds %struct.timeval* %start, i32 0, i32 1
  %6 = load i32* %tv_usec53, align 4, !tbaa !2
  %sub54 = sub nsw i32 %5, %6
  %conv55 = sext i32 %sub54 to i64
  %add56 = add i64 %conv55, %conv51
  %7 = load %struct._IO_FILE** @stdout, align 4, !tbaa !3
  ; The i64 sum is converted as an unsigned value (uitofp) and divided by 1e6
  ; before being passed as the %18.9f argument.
  %conv57 = uitofp i64 %add56 to double
  %div = fdiv double %conv57, 1.000000e+06
  %call58 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([35 x i8]* @.str, i32 0, i32 0), double %div) nounwind
  ; Return value: zext(out[12]) + zext(out[9988]), read from the output buffer.
  %arrayidx = getelementptr inbounds i8* %call1, i32 12
  %8 = load i8* %arrayidx, align 1, !tbaa !0
  %conv59 = zext i8 %8 to i32
  %arrayidx60 = getelementptr inbounds i8* %call1, i32 9988
  %9 = load i8* %arrayidx60, align 1, !tbaa !0
  %conv61 = zext i8 %9 to i32
  %add62 = add nsw i32 %conv61, %conv59
  ret i32 %add62
}

; External routines called by @main; all are declared nounwind.
declare noalias i8* @malloc(i32) nounwind

declare i32 @gettimeofday(%struct.timeval*, %struct.timezone*) nounwind

; Variadic; both pointer parameters are marked nocapture.
declare i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture, ...) nounwind

; TBAA type tree referenced by the !tbaa tags on loads and stores in @main:
;   !1 "Simple C/C++ TBAA"   (root, no parent)
;   !0 "omnipotent char"     (child of !1) -- tags the i8 buffer accesses
;   !2 "long"                (child of !0) -- tags the timeval field loads
;   !3 "any pointer"         (child of !0) -- tags the load of @stdout
!0 = metadata !{metadata !"omnipotent char", metadata !1}
!1 = metadata !{metadata !"Simple C/C++ TBAA", null}
!2 = metadata !{metadata !"long", metadata !0}
!3 = metadata !{metadata !"any pointer", metadata !0}
-------------- next part --------------
; ModuleID = 'test-s-20120126.c'
; Module preamble for the attachment whose kernel loop in @main uses
; <2 x i8> / <2 x i32> vector operations (presumably the output of the
; bb-vectorize pass -- confirm).
;
; Data layout: the leading "e" selects little-endian and "p:32:32:32" gives
; 32-bit pointers (see the LLVM Language Reference for the remaining fields).
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
; NOTE(review): the triple names armv4t; presumably that target has no vector
; unit, so the vector operations in @main would have to be legalized back to
; scalar code -- confirm against the intended CPU.
target triple = "armv4t-none-linux-gnueabi"

; Layout of the C library FILE object. In this module it is only referenced
; through pointers (@stdout and the first parameter of @fprintf).
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
; Two i32 fields; @main reads field 0 as %tv_sec and field 1 as %tv_usec.
%struct.timeval = type { i32, i32 }
; Only ever passed to @gettimeofday as a null pointer in this module.
%struct.timezone = type { i32, i32 }

; External stream handle that @main loads and hands to @fprintf.
@stdout = external global %struct._IO_FILE*
; Format string for the timing report: 35 bytes including "\0A" and the NUL.
@.str = private unnamed_addr constant [35 x i8] c"kernel execution time: %18.9f sec\0A\00", align 1

; @main (vectorized form): times a byte-wise 3-input / 3-output integer kernel
; and prints the elapsed time. Only the kernel loop body (for.body10) uses
; vector operations; every other block is scalar code.
;   1. Allocates two 30000-byte buffers with malloc (the results are used
;      without a null check).
;   2. Fills the first buffer with the low 8 bits of the index 0..29999.
;   3. Between two gettimeofday calls, runs the kernel loop nest
;      (10000 outer x 10000 inner iterations).
;   4. Prints the elapsed seconds with fprintf to stdout and returns the sum of
;      output bytes 12 and 9988 (zero-extended).
define i32 @main() nounwind {
entry:
  %start = alloca %struct.timeval, align 4
  %end = alloca %struct.timeval, align 4
  ; %call is the input buffer, %call1 the output buffer.
  %call = call noalias i8* @malloc(i32 30000) nounwind
  %call1 = call noalias i8* @malloc(i32 30000) nounwind
  br label %for.body

; Initialization loop: *x++ = (i8)i for i = 0 .. 29999.
for.body:                                         ; preds = %for.body, %entry
  %i.068 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %x.067 = phi i8* [ %call, %entry ], [ %incdec.ptr, %for.body ]
  %conv = trunc i32 %i.068 to i8
  %incdec.ptr = getelementptr inbounds i8* %x.067, i32 1
  store i8 %conv, i8* %x.067, align 1, !tbaa !0
  %inc = add nsw i32 %i.068, 1
  %exitcond70 = icmp eq i32 %inc, 30000
  br i1 %exitcond70, label %for.end, label %for.body

; Record the start timestamp.
for.end:                                          ; preds = %for.body
  %call2 = call i32 @gettimeofday(%struct.timeval* %start, %struct.timezone* null) nounwind
  br label %for.cond7.preheader

; Outer loop header: k counts 0 .. 9999. Each outer iteration restarts the
; inner loop at the beginning of both buffers (see the phis in for.body10).
for.cond7.preheader:                              ; preds = %for.inc45, %for.end
  %k.066 = phi i32 [ 0, %for.end ], [ %inc46, %for.inc45 ]
  br label %for.body10

; Inner kernel loop, 10000 iterations. Each iteration reads three input bytes
; r[0..2] and writes three output bytes w[0..2], then advances both pointers
; by 3. With r0..r2 the input bytes zero-extended to i32, the values are:
;   w[0] = (i8)(r0*123 + r1*321 + r2*567)   -- reduced and stored as a scalar
;   w[1] = (i8)(r0*234 + r1*432 + r2*987)   -- lane 0 of the <2 x i8> store
;   w[2] = (i8)(r0*345 + r1*543 + r2*789)   -- lane 1 of the <2 x i8> store
; r0 and r1 are carried as one <2 x i32> value; r2 stays scalar.
for.body10:                                       ; preds = %for.body10, %for.cond7.preheader
  %w.065 = phi i8* [ %call1, %for.cond7.preheader ], [ %incdec.ptr41, %for.body10 ]
  %i.164 = phi i32 [ 0, %for.cond7.preheader ], [ %inc43, %for.body10 ]
  %r.063 = phi i8* [ %call, %for.cond7.preheader ], [ %incdec.ptr13, %for.body10 ]
  ; r[0] and r[1] are fetched with a single <2 x i8> load (align 1);
  ; r[2] is loaded separately as a scalar.
  %0 = bitcast i8* %r.063 to <2 x i8>*
  %incdec.ptr12 = getelementptr inbounds i8* %r.063, i32 2
  %1 = load <2 x i8>* %0, align 1, !tbaa !0
  %incdec.ptr13 = getelementptr inbounds i8* %r.063, i32 3
  %2 = load i8* %incdec.ptr12, align 1, !tbaa !0
  ; %conv14 = <r0, r1>
  %conv14 = zext <2 x i8> %1 to <2 x i32>
  ; %mul = <r0*123, r1*321>; both lanes are extracted again because the sum
  ; for w[0] is formed with scalar adds.
  %mul = mul nsw <2 x i32> %conv14, <i32 123, i32 321>
  %mul.v.r1 = extractelement <2 x i32> %mul, i32 0
  %mul.v.r2 = extractelement <2 x i32> %mul, i32 1
  %conv17 = zext i8 %2 to i32
  %mul18 = mul nsw i32 %conv17, 567
  %add = add i32 %mul.v.r2, %mul.v.r1
  %add19 = add i32 %add, %mul18
  %conv20 = trunc i32 %add19 to i8
  %incdec.ptr21 = getelementptr inbounds i8* %w.065, i32 1
  store i8 %conv20, i8* %w.065, align 1, !tbaa !0
  %mul28 = mul nsw i32 %conv17, 987
  ; %mul25.v.i0 = <r1, r0> (lanes of %conv14 swapped)
  %mul25.v.i0 = shufflevector <2 x i32> %conv14, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ; %mul25 = <r1*432, r0*345>
  %mul25 = mul nsw <2 x i32> %mul25.v.i0, <i32 432, i32 345>
  ; %mul23 = <r0*234, r1*543>
  %mul23 = mul nsw <2 x i32> %conv14, <i32 234, i32 543>
  %mul38 = mul nsw i32 %conv17, 789
  ; Regroup the four products by output lane:
  ;   %add26.v.i1 = <r0*234, r0*345>
  ;   %add26.v.i0 = <r1*432, r1*543>
  %add26.v.i1 = shufflevector <2 x i32> %mul23, <2 x i32> %mul25, <2 x i32> <i32 0, i32 3>
  %add26.v.i0 = shufflevector <2 x i32> %mul25, <2 x i32> %mul23, <2 x i32> <i32 0, i32 3>
  ; %add26 = <r1*432 + r0*234, r1*543 + r0*345>
  %add26 = add <2 x i32> %add26.v.i0, %add26.v.i1
  ; Pack the two scalar r2 products into a vector: <r2*987, r2*789>.
  %add29.v.i1.1 = insertelement <2 x i32> undef, i32 %mul28, i32 0
  %add29.v.i1.2 = insertelement <2 x i32> %add29.v.i1.1, i32 %mul38, i32 1
  ; %add29 holds the full sums for w[1] (lane 0) and w[2] (lane 1).
  %add29 = add <2 x i32> %add26, %add29.v.i1.2
  %conv30 = trunc <2 x i32> %add29 to <2 x i8>
  ; Store both bytes at w + 1 with one <2 x i8> store (align 1);
  ; %incdec.ptr41 (w + 3) feeds the next iteration.
  %3 = bitcast i8* %incdec.ptr21 to <2 x i8>*
  %incdec.ptr41 = getelementptr inbounds i8* %w.065, i32 3
  store <2 x i8> %conv30, <2 x i8>* %3, align 1, !tbaa !0
  %inc43 = add nsw i32 %i.164, 1
  %exitcond = icmp eq i32 %inc43, 10000
  br i1 %exitcond, label %for.inc45, label %for.body10

; Outer loop latch: exit after 10000 passes over the buffers.
for.inc45:                                        ; preds = %for.body10
  %inc46 = add nsw i32 %k.066, 1
  %exitcond69 = icmp eq i32 %inc46, 10000
  br i1 %exitcond69, label %for.end47, label %for.cond7.preheader

; Record the end timestamp, report the elapsed time, and compute the result.
for.end47:                                        ; preds = %for.inc45
  %call48 = call i32 @gettimeofday(%struct.timeval* %end, %struct.timezone* null) nounwind
  ; %conv51 = (end field 0 - start field 0) * 1000000, sign-extended to i64.
  %tv_sec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 0
  %4 = load i32* %tv_sec, align 4, !tbaa !2
  %tv_sec49 = getelementptr inbounds %struct.timeval* %start, i32 0, i32 0
  %5 = load i32* %tv_sec49, align 4, !tbaa !2
  %sub = sub nsw i32 %4, %5
  %mul50 = mul nsw i32 %sub, 1000000
  %conv51 = sext i32 %mul50 to i64
  ; %conv55 = (end field 1 - start field 1), sign-extended to i64.
  %tv_usec = getelementptr inbounds %struct.timeval* %end, i32 0, i32 1
  %6 = load i32* %tv_usec, align 4, !tbaa !2
  %tv_usec53 = getelementptr inbounds %struct.timeval* %start, i32 0, i32 1
  %7 = load i32* %tv_usec53, align 4, !tbaa !2
  %sub54 = sub nsw i32 %6, %7
  %conv55 = sext i32 %sub54 to i64
  %add56 = add i64 %conv55, %conv51
  %8 = load %struct._IO_FILE** @stdout, align 4, !tbaa !3
  ; The i64 sum is converted as an unsigned value (uitofp) and divided by 1e6
  ; before being passed as the %18.9f argument.
  %conv57 = uitofp i64 %add56 to double
  %div = fdiv double %conv57, 1.000000e+06
  %call58 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %8, i8* getelementptr inbounds ([35 x i8]* @.str, i32 0, i32 0), double %div) nounwind
  ; Return value: zext(out[12]) + zext(out[9988]), read from the output buffer.
  %arrayidx = getelementptr inbounds i8* %call1, i32 12
  %9 = load i8* %arrayidx, align 1, !tbaa !0
  %conv59 = zext i8 %9 to i32
  %arrayidx60 = getelementptr inbounds i8* %call1, i32 9988
  %10 = load i8* %arrayidx60, align 1, !tbaa !0
  %conv61 = zext i8 %10 to i32
  %add62 = add nsw i32 %conv61, %conv59
  ret i32 %add62
}

; External routines called by @main; all are declared nounwind.
declare noalias i8* @malloc(i32) nounwind

declare i32 @gettimeofday(%struct.timeval*, %struct.timezone*) nounwind

; Variadic; both pointer parameters are marked nocapture.
declare i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture, ...) nounwind

; TBAA type tree referenced by the !tbaa tags on loads and stores in @main:
;   !1 "Simple C/C++ TBAA"   (root, no parent)
;   !0 "omnipotent char"     (child of !1) -- tags the i8 and <2 x i8> buffer
;                                             accesses
;   !2 "long"                (child of !0) -- tags the timeval field loads
;   !3 "any pointer"         (child of !0) -- tags the load of @stdout
!0 = metadata !{metadata !"omnipotent char", metadata !1}
!1 = metadata !{metadata !"Simple C/C++ TBAA", null}
!2 = metadata !{metadata !"long", metadata !0}
!3 = metadata !{metadata !"any pointer", metadata !0}


More information about the llvm-dev mailing list