[llvm-dev] Understanding Loop Vectorized IR

vivek pandya via llvm-dev llvm-dev at lists.llvm.org
Sat Sep 14 09:15:23 PDT 2019


Hello,
For the C code given below:

#include<stdio.h>
int a=0;
/*
 * Counts the global `a` from 0 upward with an empty-bodied loop that runs
 * while a <= 8, so `a` holds 9 when the loop exits. The local `e` is never
 * modified after initialisation, so the function returns 2.
 */
int d() {
  int e = 2;
  /* The loop's only effect is the update of the global `a`. */
  for (a = 0; a <= 8; a++)
    ;
  return e;
}
/*
 * Calls d() for its side effect on the global `a` (the return value is
 * discarded), then prints `a` followed by a newline.
 * NOTE(review): `void main()` is not a standard signature for main, and
 * `f` is initialised but never read.
 */
void main() {
  int f = 0;
  d();
  printf("%d\n",a);
}

$clang -O3 -c -emit-llvm  -mllvm -disable-llvm-optzns small.c

$opt -gvn -licm -loop-rotate -loop-vectorize   small.bc -o small-opt.bc

I see the vectorized IR as follows:
; Function Attrs: nounwind uwtable
; Loop-vectorized form of d(): stores 2 to %e and 0 to @a, runs the loop
; blocks below, stores the final counter value back to @a and returns 2.
define dso_local i32 @d() #0 {
entry:
  %e = alloca i32, align 4
  %0 = bitcast i32* %e to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
  store i32 2, i32* %e, align 4, !tbaa !2
  store i32 0, i32* @a, align 4, !tbaa !2
  %a.promoted = load i32, i32* @a, align 4, !tbaa !2
  ; The condition is the constant false, so control always goes to
  ; %vector.ph (the second label) and never directly to %scalar.ph.
  br i1 false, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Seeds lane 15 of the recurrence vector with the value loaded from @a.
  %vector.recur.init = insertelement <16 x i32> undef, i32 %a.promoted, i32 15
  br label %vector.body

vector.body:                                      ; preds =
%vector.body, %vector.ph
  ; %index starts at 0 and advances by 16; the exit test below is
  ; %index.next == 16, so this block executes exactly once.
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vector.recur = phi <16 x i32> [ %vector.recur.init, %vector.ph ], [
%17, %vector.body ]
  %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32
5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14,
i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  ; %1 through %16 have no uses anywhere in this function.
  %1 = add i32 %index, 0
  %2 = add i32 %index, 1
  %3 = add i32 %index, 2
  %4 = add i32 %index, 3
  %5 = add i32 %index, 4
  %6 = add i32 %index, 5
  %7 = add i32 %index, 6
  %8 = add i32 %index, 7
  %9 = add i32 %index, 8
  %10 = add i32 %index, 9
  %11 = add i32 %index, 10
  %12 = add i32 %index, 11
  %13 = add i32 %index, 12
  %14 = add i32 %index, 13
  %15 = add i32 %index, 14
  %16 = add i32 %index, 15
  ; %17 is the induction vector plus one; on the single iteration it is
  ; <1, 2, ..., 16>.
  %17 = add nsw <16 x i32> %vec.ind, <i32 1, i32 1, i32 1, i32 1, i32
1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32
1, i32 1>
  ; %18 has no uses in this function.
  %18 = shufflevector <16 x i32> %vector.recur, <16 x i32> %17, <16 x
i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
  ; %19 has no uses in this function; the loop exit below is decided by
  ; %20 alone.
  %19 = icmp ule <16 x i32> %vec.ind, <i32 9, i32 9, i32 9, i32 9, i32
9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32
9, i32 9>
  %index.next = add i32 %index, 16
  %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16,
i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32
16, i32 16, i32 16, i32 16, i32 16>
  %20 = icmp eq i32 %index.next, 16
  br i1 %20, label %middle.block, label %vector.body, !llvm.loop !6

middle.block:                                     ; preds = %vector.body
  ; Lane 15 of %17 feeds the phi in %scalar.ph; lane 14 feeds the phi in
  ; %for.end. With %17 = <1, ..., 16>, lane 14 is 15 and lane 15 is 16.
  %vector.recur.extract = extractelement <16 x i32> %17, i32 15
  %vector.recur.extract.for.phi = extractelement <16 x i32> %17, i32 14
  ; The condition is the constant true, so control always goes to %for.end.
  br i1 true, label %for.end, label %scalar.ph

scalar.ph:                                        ; preds =
%middle.block, %entry
  ; Both incoming branches have constant conditions that select the other
  ; successor, so this block and %for.cond are not reached as written.
  %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block
], [ %a.promoted, %entry ]
  %bc.resume.val = phi i32 [ 16, %middle.block ], [ 0, %entry ]
  br label %for.cond

for.cond:                                         ; preds = %for.cond,
%scalar.ph
  ; Scalar form of the loop: continues while %21 <= 8 (signed), adding 1
  ; on each iteration.
  %scalar.recur = phi i32 [ %inc, %for.cond ], [ %scalar.recur.init,
%scalar.ph ]
  %21 = phi i32 [ %inc, %for.cond ], [ %bc.resume.val, %scalar.ph ]
  %cmp = icmp sle i32 %21, 8
  %inc = add nsw i32 %21, 1
  br i1 %cmp, label %for.cond, label %for.end, !llvm.loop !8

for.end:                                          ; preds =
%middle.block, %for.cond
  ; On the %middle.block edge the value stored to @a is lane 14 of %17,
  ; which is 15 after the single vector iteration.
  %inc1.lcssa = phi i32 [ %scalar.recur, %for.cond ], [
%vector.recur.extract.for.phi, %middle.block ]
  store i32 %inc1.lcssa, i32* @a, align 4, !tbaa !2
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3
  ret i32 2
}

As highlighted above, the results of a few instructions are never used.
Can someone explain why?

Also, this gives an output (variable a) of 15, which is incorrect, as the
output should be 9. However, I don't see any problem with how the vectorized
code itself executes, and hence a = 15 is not surprising given this IR.

So is the solution to this problem that the loop should never have been vectorized?

-Vivek
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190914/ec4a2dac/attachment-0001.html>


More information about the llvm-dev mailing list