[llvm-bugs] [Bug 34468] New: LoopVectorizer creates a dead loop when ShortTripCount=32, MaxVF=64

Mon Sep 4 15:14:27 PDT 2017

https://bugs.llvm.org/show_bug.cgi?id=34468

            Bug ID: 34468
           Summary: LoopVectorizer creates a dead loop when
                    ShortTripCount=32, MaxVF=64
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: zvi.rackover at intel.com
                CC: llvm-bugs at lists.llvm.org

Here's another case where we mess up vectorization of a loop with a constant
trip count. This time for MaxVL=64, TripCount=32 (which is larger than the
TinyTripCount threshold, but still lower than MaxVL). Thanks to Ayal for
mentioning this kind of case in https://reviews.llvm.org/D37425.

***************************************************************************
 target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
                     target triple = "x86_64-apple-macosx10.8.0"                
 define void @small_tc(i8* noalias nocapture %A, i8* noalias nocapture readonly
%B) {
 entry:
   br label %for.body

 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %arrayidx = getelementptr inbounds i8, i8* %B, i64 %indvars.iv
   %0 = load i8, i8* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
   %arrayidx2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
   %1 = load i8, i8* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
   %add = add i8 %0, %1
   store i8 %add, i8* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 32
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4

 for.end:
   ret void
 }

 !3 = !{!3}
 !4 = !{!4}
***************************************************************************

Running
  opt -loop-vectorize -S -mcpu=skx
results in:
***************************************************************************
define void @small_tc(i8* noalias nocapture %A, i8* noalias nocapture readonly
%B) #0 {
entry:
  br i1 true, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %entry
  br label %vector.body

vector.body:                                      ; preds = %vector.body,
%vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <64 x i64> undef, i64 %index, i32 0
  %broadcast.splat = shufflevector <64 x i64> %broadcast.splatinsert, <64 x
i64> undef, <64 x i32> zeroinitializer
  %induction = add <64 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3,
i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64
14, i64 15, i64 16, i64 17, i64 18
, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64
28, i64 29, i64 30, i64 31, i64 32, i64 33, i64 34, i64 35, i64 36, i64 37, i64
38, i64 39, i64 40, i64 41, i64
 42, i64 43, i64 44, i64 45, i64 46, i64 47, i64 48, i64 49, i64 50, i64 51,
i64 52, i64 53, i64 54, i64 55, i64 56, i64 57, i64 58, i64 59, i64 60, i64 61,
i64 62, i64 63>
  %0 = add i64 %index, 0
  %1 = getelementptr inbounds i8, i8* %B, i64 %0
  %2 = getelementptr i8, i8* %1, i32 0
  %3 = bitcast i8* %2 to <64 x i8>*
  %wide.load = load <64 x i8>, <64 x i8>* %3, align 4
  %4 = getelementptr inbounds i8, i8* %A, i64 %0
  %5 = getelementptr i8, i8* %4, i32 0
  %6 = bitcast i8* %5 to <64 x i8>*
  %wide.load1 = load <64 x i8>, <64 x i8>* %6, align 4
  %7 = add <64 x i8> %wide.load, %wide.load1
  %8 = bitcast i8* %5 to <64 x i8>*
  store <64 x i8> %7, <64 x i8>* %8, align 4
  %index.next = add i64 %index, 64
  %9 = icmp eq i64 %index.next, 0
  br i1 %9, label %middle.block, label %vector.body, !llvm.loop !0

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 32, 0
  br i1 %cmp.n, label %for.end, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block,
%entry
  %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %entry ]
  br label %for.body

for.body:                                         ; preds = %for.body,
%scalar.ph
  %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next,
%for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i64 %indvars.iv
  %10 = load i8, i8* %arrayidx, align 4, !llvm.mem.parallel_loop_access !2
  %arrayidx2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
  %11 = load i8, i8* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !2
  %add = add i8 %10, %11
  store i8 %add, i8* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 32
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3

for.end:                                          ; preds = %middle.block,
%for.body
  ret void
}
***************************************************************************

Note that the first instruction is a branch with a constant condition
(br i1 true), which will always skip the vector loop.
VL was chosen as 64, which is larger than the loop trip count, so it is a good
thing we never enter that vector loop, but it is unfortunate that the loop is
not vectorized with a more practical VL such as 32.
Running the same opt command as above, only with -march=core-avx2, results in a
reachable vectorized loop. So this looks like another case where we need to
limit MaxVL based on a known trip count.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170904/58b30fdc/attachment-0001.html>