[LLVMdev] Enabling vectorization with LLVM 3.3 for a DSL emitting LLVM IR

Stéphane Letz letz at grame.fr
Fri Jul 5 05:37:38 PDT 2013

Le 5 juil. 2013 à 04:11, Tobias Grosser <tobias at grosser.es> a écrit :

> On 07/04/2013 01:39 PM, Stéphane Letz wrote:
>> Hi,
>> Our DSL can generate either C or LLVM IR directly. With LLVM 3.3, we can vectorize the generated C code using clang with -O3, or clang with -O1 followed by opt -O3 -vectorize-loops. But the LLVM IR generated directly for the same program cannot be vectorized with opt -O3 -vectorize-loops. So our guess is that our generated LLVM IR lacks some information that the vectorization passes need in order to work correctly.
>> Any idea of what could be lacking?
> Without any knowledge about the code, guessing is hard. You may be missing the 'noalias' keyword or the nsw/nuw flags, but there are many possibilities.
> If you add '-debug' to opt you may get some hints. Also, if you have a small test case, posting the LLVM-IR may help.
> Cheers,
> Tobias

Hi Tobias,

1) Here is a simple C loop generated by our C backend:

void computemydsp(mydsp* dsp, int count, float** inputs, float** outputs) {
	float* input0 = inputs[0];
	float* input1 = inputs[1];
	float* output0 = outputs[0];
	/* C99 loop */
		int i;
		for (i = 0; (i < count); i = (i + 1)) {
			output0[i] = (float)((float)input0[i] + (float)input1[i]);

2) Compiling it with "clang -O3" vectorizes it directly:

define void @computemydsp(%struct.mydsp* nocapture %dsp, i32 %count, float** nocapture %inputs, float** nocapture %outputs) #0 {
  %0 = load float** %inputs, align 8, !tbaa !3
  %arrayidx1 = getelementptr inbounds float** %inputs, i64 1
  %1 = load float** %arrayidx1, align 8, !tbaa !3
  %2 = load float** %outputs, align 8, !tbaa !3
  %cmp14 = icmp sgt i32 %count, 0
  br i1 %cmp14, label %for.body.lr.ph, label %for.end

for.body.lr.ph:                                   ; preds = %entry
  %cnt.cast = zext i32 %count to i64
  %n.vec = and i64 %cnt.cast, 4294967288
  %cmp.zero = icmp eq i64 %n.vec, 0
  %3 = add i32 %count, -1
  %4 = zext i32 %3 to i64
  %scevgep = getelementptr float* %2, i64 %4
  br i1 %cmp.zero, label %middle.block, label %vector.memcheck

vector.memcheck:                                  ; preds = %for.body.lr.ph
  %scevgep19 = getelementptr float* %1, i64 %4
  %scevgep17 = getelementptr float* %0, i64 %4
  %bound122 = icmp ule float* %1, %scevgep
  %bound021 = icmp ule float* %2, %scevgep19
  %bound1 = icmp ule float* %0, %scevgep
  %bound0 = icmp ule float* %2, %scevgep17
  %found.conflict23 = and i1 %bound021, %bound122
  %found.conflict = and i1 %bound0, %bound1
  %conflict.rdx = or i1 %found.conflict, %found.conflict23
  br i1 %conflict.rdx, label %middle.block, label %vector.body

vector.body:                                      ; preds = %vector.memcheck, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.memcheck ]
  %5 = getelementptr inbounds float* %0, i64 %index
  %6 = bitcast float* %5 to <4 x float>*
  %wide.load = load <4 x float>* %6, align 4
  %.sum32 = or i64 %index, 4
  %7 = getelementptr float* %0, i64 %.sum32
  %8 = bitcast float* %7 to <4 x float>*
  %wide.load25 = load <4 x float>* %8, align 4
  %9 = getelementptr inbounds float* %1, i64 %index
  %10 = bitcast float* %9 to <4 x float>*
  %wide.load26 = load <4 x float>* %10, align 4
  %.sum33 = or i64 %index, 4
  %11 = getelementptr float* %1, i64 %.sum33
  %12 = bitcast float* %11 to <4 x float>*
  %wide.load27 = load <4 x float>* %12, align 4
  %13 = fadd <4 x float> %wide.load, %wide.load26
  %14 = fadd <4 x float> %wide.load25, %wide.load27
  %15 = getelementptr inbounds float* %2, i64 %index
  %16 = bitcast float* %15 to <4 x float>*
  store <4 x float> %13, <4 x float>* %16, align 4
  %.sum34 = or i64 %index, 4
  %17 = getelementptr float* %2, i64 %.sum34
  %18 = bitcast float* %17 to <4 x float>*
  store <4 x float> %14, <4 x float>* %18, align 4
  %index.next = add i64 %index, 8
  %19 = icmp eq i64 %index.next, %n.vec
  br i1 %19, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body, %vector.memcheck, %for.body.lr.ph
  %resume.val = phi i64 [ 0, %for.body.lr.ph ], [ 0, %vector.memcheck ], [ %n.vec, %vector.body ]
  %cmp.n = icmp eq i64 %cnt.cast, %resume.val
  br i1 %cmp.n, label %for.end, label %for.body

for.body:                                         ; preds = %middle.block, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %resume.val, %middle.block ]
  %arrayidx3 = getelementptr inbounds float* %0, i64 %indvars.iv
  %20 = load float* %arrayidx3, align 4, !tbaa !4
  %arrayidx5 = getelementptr inbounds float* %1, i64 %indvars.iv
  %21 = load float* %arrayidx5, align 4, !tbaa !4
  %add = fadd float %20, %21
  %arrayidx7 = getelementptr inbounds float* %2, i64 %indvars.iv
  store float %add, float* %arrayidx7, align 4, !tbaa !4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %count
  br i1 %exitcond, label %for.end, label %for.body, !llvm.vectorizer.already_vectorized !5

for.end:                                          ; preds = %middle.block, %for.body, %entry
  ret void

; Function Attrs: nounwind ssp uwtable
define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
  ret i32 0

3) Compiling it with "clang -O1" gives the following (not yet vectorized) IR:

; Function Attrs: nounwind ssp uwtable
define void @computemydsp(%struct.mydsp* nocapture %dsp, i32 %count, float** nocapture %inputs, float** nocapture %outputs) #0 {
  %0 = load float** %inputs, align 8, !tbaa !3
  %arrayidx1 = getelementptr inbounds float** %inputs, i64 1
  %1 = load float** %arrayidx1, align 8, !tbaa !3
  %2 = load float** %outputs, align 8, !tbaa !3
  %cmp14 = icmp sgt i32 %count, 0
  br i1 %cmp14, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx3 = getelementptr inbounds float* %0, i64 %indvars.iv
  %3 = load float* %arrayidx3, align 4, !tbaa !4
  %arrayidx5 = getelementptr inbounds float* %1, i64 %indvars.iv
  %4 = load float* %arrayidx5, align 4, !tbaa !4
  %add = fadd float %3, %4
  %arrayidx7 = getelementptr inbounds float* %2, i64 %indvars.iv
  store float %add, float* %arrayidx7, align 4, !tbaa !4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void

4) Then running "opt -O3 -vectorize-loops" on that output vectorizes it:

; Function Attrs: nounwind ssp uwtable
define void @computemydsp(%struct.mydsp* nocapture %dsp, i32 %count, float** nocapture %inputs, float** nocapture %outputs) #0 {
  %0 = load float** %inputs, align 8, !tbaa !3
  %arrayidx1 = getelementptr inbounds float** %inputs, i64 1
  %1 = load float** %arrayidx1, align 8, !tbaa !3
  %2 = load float** %outputs, align 8, !tbaa !3
  %cmp14 = icmp sgt i32 %count, 0
  br i1 %cmp14, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  %cnt.cast = zext i32 %count to i64
  %3 = urem i32 %count, 24
  %n.mod.vf = zext i32 %3 to i64
  %n.vec = sub i64 %cnt.cast, %n.mod.vf
  %cmp.zero = icmp eq i32 %3, %count
  %4 = add i32 %count, -1
  %5 = zext i32 %4 to i64
  %scevgep = getelementptr float* %2, i64 %5
  br i1 %cmp.zero, label %middle.block, label %vector.memcheck

vector.memcheck:                                  ; preds = %for.body.preheader
  %scevgep6 = getelementptr float* %1, i64 %5
  %scevgep4 = getelementptr float* %0, i64 %5
  %bound19 = icmp ule float* %1, %scevgep
  %bound08 = icmp ule float* %2, %scevgep6
  %bound1 = icmp ule float* %0, %scevgep
  %bound0 = icmp ule float* %2, %scevgep4
  %found.conflict10 = and i1 %bound08, %bound19
  %found.conflict = and i1 %bound0, %bound1
  %conflict.rdx = or i1 %found.conflict, %found.conflict10
  br i1 %conflict.rdx, label %middle.block, label %vector.body

vector.body:                                      ; preds = %vector.memcheck, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.memcheck ]
  %6 = getelementptr inbounds float* %0, i64 %index
  %7 = bitcast float* %6 to <8 x float>*
  %wide.load = load <8 x float>* %7, align 4
  %.sum = add i64 %index, 8
  %8 = getelementptr float* %0, i64 %.sum
  %9 = bitcast float* %8 to <8 x float>*
  %wide.load13 = load <8 x float>* %9, align 4
  %.sum23 = add i64 %index, 16
  %10 = getelementptr float* %0, i64 %.sum23
  %11 = bitcast float* %10 to <8 x float>*
  %wide.load14 = load <8 x float>* %11, align 4
  %12 = getelementptr inbounds float* %1, i64 %index
  %13 = bitcast float* %12 to <8 x float>*
  %wide.load15 = load <8 x float>* %13, align 4
  %.sum24 = add i64 %index, 8
  %14 = getelementptr float* %1, i64 %.sum24
  %15 = bitcast float* %14 to <8 x float>*
  %wide.load16 = load <8 x float>* %15, align 4
  %.sum25 = add i64 %index, 16
  %16 = getelementptr float* %1, i64 %.sum25
  %17 = bitcast float* %16 to <8 x float>*
  %wide.load17 = load <8 x float>* %17, align 4
  %18 = fadd <8 x float> %wide.load, %wide.load15
  %19 = fadd <8 x float> %wide.load13, %wide.load16
  %20 = fadd <8 x float> %wide.load14, %wide.load17
  %21 = getelementptr inbounds float* %2, i64 %index
  %22 = bitcast float* %21 to <8 x float>*
  store <8 x float> %18, <8 x float>* %22, align 4
  %.sum26 = add i64 %index, 8
  %23 = getelementptr float* %2, i64 %.sum26
  %24 = bitcast float* %23 to <8 x float>*
  store <8 x float> %19, <8 x float>* %24, align 4
  %.sum27 = add i64 %index, 16
  %25 = getelementptr float* %2, i64 %.sum27
  %26 = bitcast float* %25 to <8 x float>*
  store <8 x float> %20, <8 x float>* %26, align 4
  %index.next = add i64 %index, 24
  %27 = icmp eq i64 %index.next, %n.vec
  br i1 %27, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body, %vector.memcheck, %for.body.preheader
  %resume.val = phi i64 [ 0, %for.body.preheader ], [ 0, %vector.memcheck ], [ %n.vec, %vector.body ]
  %cmp.n = icmp eq i64 %cnt.cast, %resume.val
  br i1 %cmp.n, label %for.end, label %for.body

for.body:                                         ; preds = %middle.block, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %resume.val, %middle.block ]
  %arrayidx3 = getelementptr inbounds float* %0, i64 %indvars.iv
  %28 = load float* %arrayidx3, align 4, !tbaa !4
  %arrayidx5 = getelementptr inbounds float* %1, i64 %indvars.iv
  %29 = load float* %arrayidx5, align 4, !tbaa !4
  %add = fadd float %28, %29
  %arrayidx7 = getelementptr inbounds float* %2, i64 %indvars.iv
  store float %add, float* %arrayidx7, align 4, !tbaa !4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv1 = trunc i64 %indvars.iv.next to i32
  %exitcond2 = icmp eq i32 %lftr.wideiv1, %count
  br i1 %exitcond2, label %for.end, label %for.body, !llvm.vectorizer.already_vectorized !5

for.end:                                          ; preds = %middle.block, %for.body, %entry
  ret void

5) Here is the LLVM IR produced directly by our LLVM backend for the same program:

define void @compute_mydsp(%struct.dsp_mydsp* %dsp, i32 %fullcount, float** noalias %inputs, float** noalias %outputs) {
  br label %code_block

code_block:                                       ; preds = %block_code
  %0 = getelementptr inbounds float** %inputs, i32 0
  %1 = load float** %0
  %2 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 0
  store float* %1, float** %2
  %fInput0 = alloca float*
  %3 = getelementptr inbounds float** %inputs, i32 1
  %4 = load float** %3
  %5 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 1
  store float* %4, float** %5
  %fInput1 = alloca float*
  %6 = getelementptr inbounds float** %outputs, i32 0
  %7 = load float** %6
  %8 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 2
  store float* %7, float** %8
  %fOutput0 = alloca float*
  br label %init_block

init_block:                                       ; preds = %code_block
  %index = alloca i32
  store i32 0, i32* %index
  br label %exec_block

exec_block:                                       ; preds = %exit_block6, %init_block
  %index1 = phi i32 [ 0, %init_block ], [ %next_index9, %exit_block6 ]
  %9 = load i32* %index
  %10 = icmp slt i32 %9, %fullcount
  %11 = select i1 %10, i32 1, i32 0
  %12 = trunc i32 %11 to i1
  br i1 %12, label %loop_body_block, label %exit_block

loop_body_block:                                  ; preds = %exec_block
  br label %code_block2

exit_block:                                       ; preds = %exec_block
  br label %return

code_block2:                                      ; preds = %loop_body_block
  %13 = load i32* %index
  %14 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 0
  %15 = load float** %14
  %16 = getelementptr inbounds float* %15, i32 %13
  store float* %16, float** %fInput0
  %17 = load i32* %index
  %18 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 1
  %19 = load float** %18
  %20 = getelementptr inbounds float* %19, i32 %17
  store float* %20, float** %fInput1
  %21 = load i32* %index
  %22 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 2
  %23 = load float** %22
  %24 = getelementptr inbounds float* %23, i32 %21
  store float* %24, float** %fOutput0
  %count = alloca i32
  %25 = load i32* %index
  %26 = sub i32 %fullcount, %25
  %27 = icmp slt i32 32, %26
  %28 = select i1 %27, i32 32, i32 %26
  store i32 %28, i32* %count
  br label %init_block3

init_block3:                                      ; preds = %code_block2
  %i = alloca i32
  store i32 0, i32* %i
  br label %exec_block4

exec_block4:                                      ; preds = %code_block8, %init_block3
  %i7 = phi i32 [ 0, %init_block3 ], [ %next_index, %code_block8 ]
  %29 = load i32* %i
  %30 = load i32* %count
  %31 = icmp slt i32 %29, %30
  %32 = select i1 %31, i32 1, i32 0
  %33 = trunc i32 %32 to i1
  br i1 %33, label %loop_body_block5, label %exit_block6

loop_body_block5:                                 ; preds = %exec_block4
  br label %code_block8

exit_block6:                                      ; preds = %exec_block4
  %34 = load i32* %index
  %next_index9 = add i32 %34, 32
  store i32 %next_index9, i32* %index
  br label %exec_block

code_block8:                                      ; preds = %loop_body_block5
  %35 = load i32* %i
  %36 = load float** %fOutput0
  %37 = getelementptr inbounds float* %36, i32 %35
  %38 = load i32* %i
  %39 = load float** %fInput0
  %40 = getelementptr inbounds float* %39, i32 %38
  %41 = load float* %40
  %42 = load i32* %i
  %43 = load float** %fInput1
  %44 = getelementptr inbounds float* %43, i32 %42
  %45 = load float* %44
  %46 = fadd float %41, %45
  store float %46, float* %37
  %47 = load i32* %i
  %next_index = add i32 %47, 1
  store i32 %next_index, i32* %i
  br label %exec_block4

return:                                           ; preds = %exit_block
  ret void

6) Then running "opt -O3 -vectorize-loops" on it *does not* vectorize it:

; Function Attrs: nounwind
define void @compute_mydsp(%struct.dsp_mydsp* nocapture %dsp, i32 %fullcount, float** noalias nocapture %inputs, float** noalias nocapture %outputs) #0 {
  %0 = load float** %inputs
  %1 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 0
  store float* %0, float** %1
  %2 = getelementptr inbounds float** %inputs, i32 1
  %3 = load float** %2
  %4 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 1
  store float* %3, float** %4
  %5 = load float** %outputs
  %6 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 2
  store float* %5, float** %6
  %7 = icmp sgt i32 %fullcount, 0
  br i1 %7, label %code_block2.lr.ph, label %return

code_block2.lr.ph:                                ; preds = %block_code
  %8 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 0
  %9 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 1
  %10 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 2
  br label %code_block2

code_block2:                                      ; preds = %exit_block6, %code_block2.lr.ph
  %next_index95 = phi i32 [ 0, %code_block2.lr.ph ], [ %next_index9, %exit_block6 ]
  %11 = load float** %8
  %12 = load float** %9
  %13 = load float** %10
  %14 = sub i32 %fullcount, %next_index95
  %15 = icmp sgt i32 %14, 32
  %16 = select i1 %15, i32 32, i32 %14
  %17 = icmp sgt i32 %16, 0
  br i1 %17, label %code_block8, label %exit_block6

exit_block6:                                      ; preds = %code_block8, %code_block2
  %next_index9 = add i32 %next_index95, 32
  %18 = icmp slt i32 %next_index9, %fullcount
  br i1 %18, label %code_block2, label %return

code_block8:                                      ; preds = %code_block2, %code_block8
  %next_index3 = phi i32 [ %next_index, %code_block8 ], [ 0, %code_block2 ]
  %.sum = add i32 %next_index95, %next_index3
  %19 = getelementptr inbounds float* %13, i32 %.sum
  %.sum8 = add i32 %next_index95, %next_index3
  %20 = getelementptr inbounds float* %11, i32 %.sum8
  %21 = load float* %20
  %22 = getelementptr inbounds float* %12, i32 %.sum
  %23 = load float* %22
  %24 = fadd float %21, %23
  store float %24, float* %19
  %next_index = add i32 %next_index3, 1
  %25 = icmp slt i32 %next_index, %16
  br i1 %25, label %code_block8, label %exit_block6

return:                                           ; preds = %exit_block6, %block_code
  ret void

Any idea what is wrong then?


Stéphane Letz

