[LLVMdev] loop vectorizer: this loop is not worth vectorizing

Thu Oct 31 20:27:50 PDT 2013

I am trying a setup where a single loop is rewritten as two nested loops. 
This avoids the 'rem' and 'div' instructions in the index calculation 
(which give the loop vectorizer a hard time).

However, with this setup the loop vectorizer complains that the inner 
loop is too small.

LV: Checking a loop in "main"
LV: Found a loop: L3
LV: Found a loop with a very small trip count. This loop is not worth 
vectorizing.
LV: Not vectorizing.

Here is the IR:

; Two-level loop nest. Each outer iteration (header %L5) covers eight
; consecutive floats starting at index %25 * 8; the inner loop %L3 runs four
; times and, per iteration, stores %arg5[k] + %arg6[k] into %arg4[k] for one
; element of the lower group of four and one element of the upper group.
define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* 
noalias %arg4, float* noalias %arg5, float* noalias %arg6) {
entrypoint:
   ; %arg2 selects whether %arg3 is added to both %arg0 and %arg1 (block %L0).
   br i1 %arg2, label %L0, label %L2

L0:                                               ; preds = %entrypoint
   %0 = add nsw i64 %arg0, %arg3
   %1 = add nsw i64 %arg1, %arg3
   br label %L2

L2:                                               ; preds = %entrypoint, %L0
   %2 = phi i64 [ %0, %L0 ], [ %arg0, %entrypoint ]
   %3 = phi i64 [ %1, %L0 ], [ %arg1, %entrypoint ]
   ; Outer-loop start (%4) and end (%5): signed division of both values by 4.
   %4 = sdiv i64 %2, 4
   %5 = sdiv i64 %3, 4
   ; Unconditional branch: the outer loop is entered without first comparing
   ; %4 against %5, so its body runs at least once.
   br label %L5

L3:                                               ; preds = %L3, %L5
   ; Inner loop. %6 takes the values 0, 1, 2, 3.
   %6 = phi i64 [ %21, %L3 ], [ 0, %L5 ]
   ; %7 indexes the lower group of four floats, %8 the upper group.
   %7 = add nsw i64 %26, %6
   %8 = add nsw i64 %27, %6
   %9 = getelementptr float* %arg5, i64 %7
   %10 = load float* %9, align 4
   %11 = getelementptr float* %arg5, i64 %8
   %12 = load float* %11, align 4
   %13 = getelementptr float* %arg6, i64 %7
   %14 = load float* %13, align 4
   %15 = getelementptr float* %arg6, i64 %8
   %16 = load float* %15, align 4
   %17 = fadd float %16, %12
   %18 = fadd float %14, %10
   %19 = getelementptr float* %arg4, i64 %7
   store float %18, float* %19, align 4
   %20 = getelementptr float* %arg4, i64 %8
   store float %17, float* %20, align 4
   %21 = add nsw i64 %6, 1
   ; The exit test uses the pre-increment value: it leaves the loop once
   ; %6 > 2, i.e. after the iteration with %6 == 3 (four iterations in total).
   %22 = icmp sgt i64 %6, 2
   br i1 %22, label %L4, label %L3

L4:                                               ; preds = %L3
   ; Outer-loop latch: advance %25 and continue while the new value is below %5.
   %23 = add nsw i64 %25, 1
   %24 = icmp slt i64 %23, %5
   br i1 %24, label %L5, label %L6

L5:                                               ; preds = %L4, %L2
   ; Outer-loop header. %26 = %25 * 8 is the first index of this block of
   ; eight floats; the low three bits of %26 are zero, so 'or ..., 4' yields
   ; %26 + 4, the first index of the upper group.
   %25 = phi i64 [ %23, %L4 ], [ %4, %L2 ]
   %26 = shl i64 %25, 3
   %27 = or i64 %26, 4
   br label %L3

L6:                                               ; preds = %L4
   ret void
}

The L3 loop has a trip count of 4. The L5 outer loop has a variable trip 
count that depends on the function's arguments.

I cannot make the L3 loop larger so that the vectorizer might be happy, 
because this will again introduce 'rem' and 'div' in the index calculation.

I am using these passes:

// Function-level pass pipeline, in the order the passes are added:
// basic alias analysis, LICM, GVN, the loop vectorizer, and then cleanup
// (instruction combining, early CSE, CFG simplification).
// NOTE(review): no loop-unrolling or SLP-vectorizer pass is added here.
// Presumably the clang-generated IR quoted later in this post was vectorized
// by such passes after the four-iteration inner loop was fully unrolled,
// rather than by the loop vectorizer -- TODO confirm against clang's pipeline.
functionPassManager->add(llvm::createBasicAliasAnalysisPass());
       functionPassManager->add(llvm::createLICMPass());
       functionPassManager->add(llvm::createGVNPass());
       functionPassManager->add(llvm::createLoopVectorizePass());
functionPassManager->add(llvm::createInstructionCombiningPass());
       functionPassManager->add(llvm::createEarlyCSEPass());
functionPassManager->add(llvm::createCFGSimplificationPass());

I am wondering whether there might be a pass I could run before the 
loop vectorizer that transforms the code so that the vectorizer is 
happy. I am asking because, for a C function which tries to 
mimic the above IR

// Element-wise float addition: c[k] = a[k] + b[k].
//
// The outer loop runs i over [start / 4, end / 4). Each i covers the eight
// consecutive elements 8*i .. 8*i + 7, handled as two groups of four:
// ir0 walks the lower group (8*i + q) and ir1 the upper group (8*i + 4 + q).
// The __restrict__ qualifiers declare that c, a and b do not alias.
void bar(std::uint64_t start, std::uint64_t end, float * __restrict__  
c, float * __restrict__ a, float * __restrict__ b)
{
   // Fixed trip count of the inner loop.
   const std::uint64_t inner = 4;
   for (std::uint64_t i = start/inner ; i < end/inner ; i++ ) {
     for (std::uint64_t q = 0 ; q < inner ; q++ ) {
       // Indices are built from multiplications and additions only, so no
       // division or remainder appears in the index calculation.
       const std::uint64_t ir0 = ( i * 2 + 0 ) * inner + q;
       const std::uint64_t ir1 = ( i * 2 + 1 ) * inner + q;

       c[ ir0 ]         = a[ ir0 ]         + b[ ir0 ];
       c[ ir1 ]         = a[ ir1 ]         + b[ ir1 ];
     }
   }
}

the loop vectorizer complains as well, but the produced code is vectorized:

LV: Checking a loop in "_Z3barmmPfS_S_"
LV: Found a loop: for.body4
LV: Found an induction variable.
LV: Found unvectorizable type.
LV: Can't vectorize the instructions or CFG
LV: Not vectorizing.

; Function Attrs: nounwind uwtable
; Only one loop (for.body4) remains in this function; each of its iterations
; performs two <4 x float> load/load/fadd/store sequences, one per group of
; four floats. No separate four-iteration inner loop is present.
define void @_Z3barmmPfS_S_(i64 %start, i64 %end, float* noalias %c, 
float* noalias %a, float* noalias %b) #3 {
entry:
   ; start / 4 and end / 4, computed as logical right shifts by 2.
   %div = lshr i64 %start, 2
   %div1 = lshr i64 %end, 2
   ; Guard: the loop is skipped entirely unless %div < %div1 (unsigned).
   %cmp9 = icmp ult i64 %div, %div1
   br i1 %cmp9, label %for.body4.preheader, label %for.end20

for.body4.preheader:                              ; preds = %entry
   br label %for.body4

for.body4:                                        ; preds = 
%for.body4.preheader, %for.body4
   %storemerge10 = phi i64 [ %inc19, %for.body4 ], [ %div, 
%for.body4.preheader ]
   ; %mul5 = i * 8 is the first index of the lower group of four floats;
   ; %add82 = %mul5 | 4 is the first index of the upper group.
   %mul5 = shl i64 %storemerge10, 3
   %add82 = or i64 %mul5, 4
   %arrayidx = getelementptr inbounds float* %a, i64 %mul5
   %arrayidx11 = getelementptr inbounds float* %b, i64 %mul5
   %arrayidx13 = getelementptr inbounds float* %c, i64 %mul5
   %arrayidx14 = getelementptr inbounds float* %a, i64 %add82
   %arrayidx15 = getelementptr inbounds float* %b, i64 %add82
   %arrayidx17 = getelementptr inbounds float* %c, i64 %add82
   ; Lower group: four floats of a and b are loaded as vectors, added, and
   ; stored to c. The vector accesses keep the scalar alignment of 4.
   %0 = bitcast float* %arrayidx to <4 x float>*
   %1 = load <4 x float>* %0, align 4
   %2 = bitcast float* %arrayidx11 to <4 x float>*
   %3 = load <4 x float>* %2, align 4
   %4 = fadd <4 x float> %1, %3
   %5 = bitcast float* %arrayidx13 to <4 x float>*
   store <4 x float> %4, <4 x float>* %5, align 4
   ; Upper group: same sequence at offset %add82.
   %6 = bitcast float* %arrayidx14 to <4 x float>*
   %7 = load <4 x float>* %6, align 4
   %8 = bitcast float* %arrayidx15 to <4 x float>*
   %9 = load <4 x float>* %8, align 4
   %10 = fadd <4 x float> %7, %9
   %11 = bitcast float* %arrayidx17 to <4 x float>*
   store <4 x float> %10, <4 x float>* %11, align 4
   ; Advance i and continue while it is below end / 4 (unsigned compare).
   %inc19 = add i64 %storemerge10, 1
   %cmp = icmp ult i64 %inc19, %div1
   br i1 %cmp, label %for.body4, label %for.end20.loopexit

for.end20.loopexit:                               ; preds = %for.body4
   br label %for.end20

for.end20:                                        ; preds = 
%for.end20.loopexit, %entry
   ret void
}

But here the vectorization must have happened before the loop 
vectorizer ran. It's starting to get frustrating.

Frank