[LLVMdev] Enabling vectorization with LLVM 3.3 for a DSL emitting LLVM IR
Stéphane Letz
letz at grame.fr
Fri Jul 5 05:37:38 PDT 2013
Le 5 juil. 2013 à 04:11, Tobias Grosser <tobias at grosser.es> a écrit :
> On 07/04/2013 01:39 PM, Stéphane Letz wrote:
>> Hi,
>>
>> Our DSL can generate C or directly generate LLVM IR. With LLVM 3.3, we can vectorize the C produced code using clang with -O3, or clang with -O1 then opt -O3 -vectorize-loops. But the same program generating LLVM IR version cannot be vectorized with opt -O3 -vectorize-loops. So our guess is that our generated LLVM IR lacks some informations that are needed by the vectorization passes to correctly work.
>>
>> Any idea of what could be lacking?
>
> Without any knowledge about the code guessing is hard. You may miss the 'noalias' keyword or nsw/nuw flags, but there are many possibilities.
>
> If you add '-debug' to opt you may get some hints. Also, if you have a small test case, posting the LLVM-IR may help.
>
> Cheers,
> Tobias
>
Hi Tobias,
1) Here is a simple C loop generated by our C backend:
void computemydsp(mydsp* dsp, int count, float** inputs, float** outputs) {
float* input0 = inputs[0];
float* input1 = inputs[1];
float* output0 = outputs[0];
/* C99 loop */
{
int i;
for (i = 0; (i < count); i = (i + 1)) {
output0[i] = (float)((float)input0[i] + (float)input1[i]);
}
}
}
2) Compiling it with "clang -O3" vectorize it directly:
define void @computemydsp(%struct.mydsp* nocapture %dsp, i32 %count, float** nocapture %inputs, float** nocapture %outputs) #0 {
entry:
%0 = load float** %inputs, align 8, !tbaa !3
%arrayidx1 = getelementptr inbounds float** %inputs, i64 1
%1 = load float** %arrayidx1, align 8, !tbaa !3
%2 = load float** %outputs, align 8, !tbaa !3
%cmp14 = icmp sgt i32 %count, 0
br i1 %cmp14, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
%cnt.cast = zext i32 %count to i64
%n.vec = and i64 %cnt.cast, 4294967288
%cmp.zero = icmp eq i64 %n.vec, 0
%3 = add i32 %count, -1
%4 = zext i32 %3 to i64
%scevgep = getelementptr float* %2, i64 %4
br i1 %cmp.zero, label %middle.block, label %vector.memcheck
vector.memcheck: ; preds = %for.body.lr.ph
%scevgep19 = getelementptr float* %1, i64 %4
%scevgep17 = getelementptr float* %0, i64 %4
%bound122 = icmp ule float* %1, %scevgep
%bound021 = icmp ule float* %2, %scevgep19
%bound1 = icmp ule float* %0, %scevgep
%bound0 = icmp ule float* %2, %scevgep17
%found.conflict23 = and i1 %bound021, %bound122
%found.conflict = and i1 %bound0, %bound1
%conflict.rdx = or i1 %found.conflict, %found.conflict23
br i1 %conflict.rdx, label %middle.block, label %vector.body
vector.body: ; preds = %vector.memcheck, %vector.body
%index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.memcheck ]
%5 = getelementptr inbounds float* %0, i64 %index
%6 = bitcast float* %5 to <4 x float>*
%wide.load = load <4 x float>* %6, align 4
%.sum32 = or i64 %index, 4
%7 = getelementptr float* %0, i64 %.sum32
%8 = bitcast float* %7 to <4 x float>*
%wide.load25 = load <4 x float>* %8, align 4
%9 = getelementptr inbounds float* %1, i64 %index
%10 = bitcast float* %9 to <4 x float>*
%wide.load26 = load <4 x float>* %10, align 4
%.sum33 = or i64 %index, 4
%11 = getelementptr float* %1, i64 %.sum33
%12 = bitcast float* %11 to <4 x float>*
%wide.load27 = load <4 x float>* %12, align 4
%13 = fadd <4 x float> %wide.load, %wide.load26
%14 = fadd <4 x float> %wide.load25, %wide.load27
%15 = getelementptr inbounds float* %2, i64 %index
%16 = bitcast float* %15 to <4 x float>*
store <4 x float> %13, <4 x float>* %16, align 4
%.sum34 = or i64 %index, 4
%17 = getelementptr float* %2, i64 %.sum34
%18 = bitcast float* %17 to <4 x float>*
store <4 x float> %14, <4 x float>* %18, align 4
%index.next = add i64 %index, 8
%19 = icmp eq i64 %index.next, %n.vec
br i1 %19, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body, %vector.memcheck, %for.body.lr.ph
%resume.val = phi i64 [ 0, %for.body.lr.ph ], [ 0, %vector.memcheck ], [ %n.vec, %vector.body ]
%cmp.n = icmp eq i64 %cnt.cast, %resume.val
br i1 %cmp.n, label %for.end, label %for.body
for.body: ; preds = %middle.block, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %resume.val, %middle.block ]
%arrayidx3 = getelementptr inbounds float* %0, i64 %indvars.iv
%20 = load float* %arrayidx3, align 4, !tbaa !4
%arrayidx5 = getelementptr inbounds float* %1, i64 %indvars.iv
%21 = load float* %arrayidx5, align 4, !tbaa !4
%add = fadd float %20, %21
%arrayidx7 = getelementptr inbounds float* %2, i64 %indvars.iv
store float %add, float* %arrayidx7, align 4, !tbaa !4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %count
br i1 %exitcond, label %for.end, label %for.body, !llvm.vectorizer.already_vectorized !5
for.end: ; preds = %middle.block, %for.body, %entry
ret void
}
; Function Attrs: nounwind ssp uwtable
define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
entry:
ret i32 0
}
3) compiling it with "clang -O1"
; Function Attrs: nounwind ssp uwtable
define void @computemydsp(%struct.mydsp* nocapture %dsp, i32 %count, float** nocapture %inputs, float** nocapture %outputs) #0 {
entry:
%0 = load float** %inputs, align 8, !tbaa !3
%arrayidx1 = getelementptr inbounds float** %inputs, i64 1
%1 = load float** %arrayidx1, align 8, !tbaa !3
%2 = load float** %outputs, align 8, !tbaa !3
%cmp14 = icmp sgt i32 %count, 0
br i1 %cmp14, label %for.body, label %for.end
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx3 = getelementptr inbounds float* %0, i64 %indvars.iv
%3 = load float* %arrayidx3, align 4, !tbaa !4
%arrayidx5 = getelementptr inbounds float* %1, i64 %indvars.iv
%4 = load float* %arrayidx5, align 4, !tbaa !4
%add = fadd float %3, %4
%arrayidx7 = getelementptr inbounds float* %2, i64 %indvars.iv
store float %add, float* %arrayidx7, align 4, !tbaa !4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %count
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
ret void
}
4) then using "opt -o3 -vectorize-loops" vectorize it:
; Function Attrs: nounwind ssp uwtable
define void @computemydsp(%struct.mydsp* nocapture %dsp, i32 %count, float** nocapture %inputs, float** nocapture %outputs) #0 {
entry:
%0 = load float** %inputs, align 8, !tbaa !3
%arrayidx1 = getelementptr inbounds float** %inputs, i64 1
%1 = load float** %arrayidx1, align 8, !tbaa !3
%2 = load float** %outputs, align 8, !tbaa !3
%cmp14 = icmp sgt i32 %count, 0
br i1 %cmp14, label %for.body.preheader, label %for.end
for.body.preheader: ; preds = %entry
%cnt.cast = zext i32 %count to i64
%3 = urem i32 %count, 24
%n.mod.vf = zext i32 %3 to i64
%n.vec = sub i64 %cnt.cast, %n.mod.vf
%cmp.zero = icmp eq i32 %3, %count
%4 = add i32 %count, -1
%5 = zext i32 %4 to i64
%scevgep = getelementptr float* %2, i64 %5
br i1 %cmp.zero, label %middle.block, label %vector.memcheck
vector.memcheck: ; preds = %for.body.preheader
%scevgep6 = getelementptr float* %1, i64 %5
%scevgep4 = getelementptr float* %0, i64 %5
%bound19 = icmp ule float* %1, %scevgep
%bound08 = icmp ule float* %2, %scevgep6
%bound1 = icmp ule float* %0, %scevgep
%bound0 = icmp ule float* %2, %scevgep4
%found.conflict10 = and i1 %bound08, %bound19
%found.conflict = and i1 %bound0, %bound1
%conflict.rdx = or i1 %found.conflict, %found.conflict10
br i1 %conflict.rdx, label %middle.block, label %vector.body
vector.body: ; preds = %vector.memcheck, %vector.body
%index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.memcheck ]
%6 = getelementptr inbounds float* %0, i64 %index
%7 = bitcast float* %6 to <8 x float>*
%wide.load = load <8 x float>* %7, align 4
%.sum = add i64 %index, 8
%8 = getelementptr float* %0, i64 %.sum
%9 = bitcast float* %8 to <8 x float>*
%wide.load13 = load <8 x float>* %9, align 4
%.sum23 = add i64 %index, 16
%10 = getelementptr float* %0, i64 %.sum23
%11 = bitcast float* %10 to <8 x float>*
%wide.load14 = load <8 x float>* %11, align 4
%12 = getelementptr inbounds float* %1, i64 %index
%13 = bitcast float* %12 to <8 x float>*
%wide.load15 = load <8 x float>* %13, align 4
%.sum24 = add i64 %index, 8
%14 = getelementptr float* %1, i64 %.sum24
%15 = bitcast float* %14 to <8 x float>*
%wide.load16 = load <8 x float>* %15, align 4
%.sum25 = add i64 %index, 16
%16 = getelementptr float* %1, i64 %.sum25
%17 = bitcast float* %16 to <8 x float>*
%wide.load17 = load <8 x float>* %17, align 4
%18 = fadd <8 x float> %wide.load, %wide.load15
%19 = fadd <8 x float> %wide.load13, %wide.load16
%20 = fadd <8 x float> %wide.load14, %wide.load17
%21 = getelementptr inbounds float* %2, i64 %index
%22 = bitcast float* %21 to <8 x float>*
store <8 x float> %18, <8 x float>* %22, align 4
%.sum26 = add i64 %index, 8
%23 = getelementptr float* %2, i64 %.sum26
%24 = bitcast float* %23 to <8 x float>*
store <8 x float> %19, <8 x float>* %24, align 4
%.sum27 = add i64 %index, 16
%25 = getelementptr float* %2, i64 %.sum27
%26 = bitcast float* %25 to <8 x float>*
store <8 x float> %20, <8 x float>* %26, align 4
%index.next = add i64 %index, 24
%27 = icmp eq i64 %index.next, %n.vec
br i1 %27, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body, %vector.memcheck, %for.body.preheader
%resume.val = phi i64 [ 0, %for.body.preheader ], [ 0, %vector.memcheck ], [ %n.vec, %vector.body ]
%cmp.n = icmp eq i64 %cnt.cast, %resume.val
br i1 %cmp.n, label %for.end, label %for.body
for.body: ; preds = %middle.block, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %resume.val, %middle.block ]
%arrayidx3 = getelementptr inbounds float* %0, i64 %indvars.iv
%28 = load float* %arrayidx3, align 4, !tbaa !4
%arrayidx5 = getelementptr inbounds float* %1, i64 %indvars.iv
%29 = load float* %arrayidx5, align 4, !tbaa !4
%add = fadd float %28, %29
%arrayidx7 = getelementptr inbounds float* %2, i64 %indvars.iv
store float %add, float* %arrayidx7, align 4, !tbaa !4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv1 = trunc i64 %indvars.iv.next to i32
%exitcond2 = icmp eq i32 %lftr.wideiv1, %count
br i1 %exitcond2, label %for.end, label %for.body, !llvm.vectorizer.already_vectorized !5
for.end: ; preds = %middle.block, %for.body, %entry
ret void
}
5) producing LLVM IR with our LLVM backend :
define void @compute_mydsp(%struct.dsp_mydsp* %dsp, i32 %fullcount, float** noalias %inputs, float** noalias %outputs) {
block_code:
br label %code_block
code_block: ; preds = %block_code
%0 = getelementptr inbounds float** %inputs, i32 0
%1 = load float** %0
%2 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 0
store float* %1, float** %2
%fInput0 = alloca float*
%3 = getelementptr inbounds float** %inputs, i32 1
%4 = load float** %3
%5 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 1
store float* %4, float** %5
%fInput1 = alloca float*
%6 = getelementptr inbounds float** %outputs, i32 0
%7 = load float** %6
%8 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 2
store float* %7, float** %8
%fOutput0 = alloca float*
br label %init_block
init_block: ; preds = %code_block
%index = alloca i32
store i32 0, i32* %index
br label %exec_block
exec_block: ; preds = %exit_block6, %init_block
%index1 = phi i32 [ 0, %init_block ], [ %next_index9, %exit_block6 ]
%9 = load i32* %index
%10 = icmp slt i32 %9, %fullcount
%11 = select i1 %10, i32 1, i32 0
%12 = trunc i32 %11 to i1
br i1 %12, label %loop_body_block, label %exit_block
loop_body_block: ; preds = %exec_block
br label %code_block2
exit_block: ; preds = %exec_block
br label %return
code_block2: ; preds = %loop_body_block
%13 = load i32* %index
%14 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 0
%15 = load float** %14
%16 = getelementptr inbounds float* %15, i32 %13
store float* %16, float** %fInput0
%17 = load i32* %index
%18 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 1
%19 = load float** %18
%20 = getelementptr inbounds float* %19, i32 %17
store float* %20, float** %fInput1
%21 = load i32* %index
%22 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 2
%23 = load float** %22
%24 = getelementptr inbounds float* %23, i32 %21
store float* %24, float** %fOutput0
%count = alloca i32
%25 = load i32* %index
%26 = sub i32 %fullcount, %25
%27 = icmp slt i32 32, %26
%28 = select i1 %27, i32 32, i32 %26
store i32 %28, i32* %count
br label %init_block3
init_block3: ; preds = %code_block2
%i = alloca i32
store i32 0, i32* %i
br label %exec_block4
exec_block4: ; preds = %code_block8, %init_block3
%i7 = phi i32 [ 0, %init_block3 ], [ %next_index, %code_block8 ]
%29 = load i32* %i
%30 = load i32* %count
%31 = icmp slt i32 %29, %30
%32 = select i1 %31, i32 1, i32 0
%33 = trunc i32 %32 to i1
br i1 %33, label %loop_body_block5, label %exit_block6
loop_body_block5: ; preds = %exec_block4
br label %code_block8
exit_block6: ; preds = %exec_block4
%34 = load i32* %index
%next_index9 = add i32 %34, 32
store i32 %next_index9, i32* %index
br label %exec_block
code_block8: ; preds = %loop_body_block5
%35 = load i32* %i
%36 = load float** %fOutput0
%37 = getelementptr inbounds float* %36, i32 %35
%38 = load i32* %i
%39 = load float** %fInput0
%40 = getelementptr inbounds float* %39, i32 %38
%41 = load float* %40
%42 = load i32* %i
%43 = load float** %fInput1
%44 = getelementptr inbounds float* %43, i32 %42
%45 = load float* %44
%46 = fadd float %41, %45
store float %46, float* %37
%47 = load i32* %i
%next_index = add i32 %47, 1
store i32 %next_index, i32* %i
br label %exec_block4
return: ; preds = %exit_block
ret void
}
6) Then using "opt -o3 -vectorize-loops" *does not* vectorize it:
; Function Attrs: nounwind
define void @compute_mydsp(%struct.dsp_mydsp* nocapture %dsp, i32 %fullcount, float** noalias nocapture %inputs, float** noalias nocapture %outputs) #0 {
block_code:
%0 = load float** %inputs
%1 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 0
store float* %0, float** %1
%2 = getelementptr inbounds float** %inputs, i32 1
%3 = load float** %2
%4 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 1
store float* %3, float** %4
%5 = load float** %outputs
%6 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 2
store float* %5, float** %6
%7 = icmp sgt i32 %fullcount, 0
br i1 %7, label %code_block2.lr.ph, label %return
code_block2.lr.ph: ; preds = %block_code
%8 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 0
%9 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 1
%10 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 2
br label %code_block2
code_block2: ; preds = %exit_block6, %code_block2.lr.ph
%next_index95 = phi i32 [ 0, %code_block2.lr.ph ], [ %next_index9, %exit_block6 ]
%11 = load float** %8
%12 = load float** %9
%13 = load float** %10
%14 = sub i32 %fullcount, %next_index95
%15 = icmp sgt i32 %14, 32
%16 = select i1 %15, i32 32, i32 %14
%17 = icmp sgt i32 %16, 0
br i1 %17, label %code_block8, label %exit_block6
exit_block6: ; preds = %code_block8, %code_block2
%next_index9 = add i32 %next_index95, 32
%18 = icmp slt i32 %next_index9, %fullcount
br i1 %18, label %code_block2, label %return
code_block8: ; preds = %code_block2, %code_block8
%next_index3 = phi i32 [ %next_index, %code_block8 ], [ 0, %code_block2 ]
%.sum = add i32 %next_index95, %next_index3
%19 = getelementptr inbounds float* %13, i32 %.sum
%.sum8 = add i32 %next_index95, %next_index3
%20 = getelementptr inbounds float* %11, i32 %.sum8
%21 = load float* %20
%22 = getelementptr inbounds float* %12, i32 %.sum
%23 = load float* %22
%24 = fadd float %21, %23
store float %24, float* %19
%next_index = add i32 %next_index3, 1
%25 = icmp slt i32 %next_index, %16
br i1 %25, label %code_block8, label %exit_block6
return: ; preds = %exit_block6, %block_code
ret void
}
Any idea what is wrong then?
Thanks
Stéphane Letz
More information about the llvm-dev
mailing list