[llvm-bugs] [Bug 27881] Code explosion when LoopVectorize vectorizes a loop with a stride of 8 floats, part 2

Sun Oct 13 18:38:50 PDT 2019

https://bugs.llvm.org/show_bug.cgi?id=27881

Florian Hahn <florian_hahn at apple.com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|NEW                         |RESOLVED
         Resolution|---                         |FIXED
                 CC|                            |florian_hahn at apple.com

--- Comment #16 from Florian Hahn <florian_hahn at apple.com> ---
Looks like this has been fixed in trunk. Please re-open if it is still an
issue.

For the IR snippet, we now get the code below. The vectorised version seems to
be around 3x faster on my X86 box.

; Module header: name of the originating source file, the target data layout
; string, and the target triple this IR was generated for.
source_filename = "tc2.ll"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"

; float @foo(float* %a, float* %b, i32 %n)
;
; Vectorized form of a stride-8 sum reduction. The scalar loop (for.body)
; visits element index i = 0, 8, 16, ... while i < n and accumulates
;     s = s + 1.0 + a[i] + b[i]        (all fadds carry the `fast` flag)
; starting from s = 0.0. The function returns s, or 0.0 when n <= 0.
;
; Structure:
;   for.body.preheader  computes the trip count %2 = ((n - 1) >> 3) + 1 and
;                       falls back to the scalar loop when %2 <= 16.
;   vector.ph           computes %n.vec, the number of iterations given to the
;                       vector loop; between 1 and 16 iterations are always
;                       left over for the scalar loop.
;   vector.body         processes 16 scalar iterations per pass using four
;                       <4 x float> accumulators.
;   middle.block        folds the four accumulators into one scalar.
;   scalar.ph/for.body  run the remaining iterations.
define float @foo(float* %a, float* %b, i32 %n) #0 {
entry:
  ; Nothing to do for n <= 0: jump straight to the exit, which returns 0.0.
  %cmp113 = icmp sgt i32 %n, 0
  br i1 %cmp113, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; %2 = ((n - 1) >> 3) + 1 is the number of stride-8 iterations.
  %t0 = sext i32 %n to i64
  %0 = add i64 %t0, -1
  %1 = lshr i64 %0, 3
  %2 = add nuw nsw i64 %1, 1
  ; The vector loop is entered only when there are more than 16 iterations.
  %min.iters.check = icmp ule i64 %2, 16
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; %4 = (%2 % 16), with a remainder of 0 replaced by 16, so the scalar loop
  ; always receives at least one iteration. %n.vec is a multiple of 16.
  %n.mod.vf = urem i64 %2, 16
  %3 = icmp eq i64 %n.mod.vf, 0
  %4 = select i1 %3, i64 16, i64 %n.mod.vf
  %n.vec = sub i64 %2, %4
  ; Element index at which the scalar loop resumes (8 elements per iteration).
  %ind.end = mul i64 %n.vec, 8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %index counts scalar iterations already handled; it advances by 16.
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Four independent partial-sum vectors, each starting at zero.
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %41, %vector.body ]
  %vec.phi4 = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %42, %vector.body ]
  %vec.phi5 = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %43, %vector.body ]
  %vec.phi6 = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %44, %vector.body ]
  ; Element offset of the first iteration of this pass.
  %offset.idx = mul i64 %index, 8
  ; Vector induction values. None of %induction, %induction1, %induction2 or
  ; %induction3 is referenced by a later instruction in this function.
  %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %offset.idx, i32 0
  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 8, i64 16, i64 24>
  %induction1 = add <4 x i64> %broadcast.splat, <i64 32, i64 40, i64 48, i64 56>
  %induction2 = add <4 x i64> %broadcast.splat, <i64 64, i64 72, i64 80, i64 88>
  %induction3 = add <4 x i64> %broadcast.splat, <i64 96, i64 104, i64 112, i64 120>
  ; Base element offsets of the four 32-float windows read in this pass.
  %5 = add i64 %offset.idx, 0
  %6 = add i64 %offset.idx, 32
  %7 = add i64 %offset.idx, 64
  %8 = add i64 %offset.idx, 96
  ; --- loads from %a ---
  %9 = getelementptr inbounds float, float* %a, i64 %5
  %10 = getelementptr inbounds float, float* %a, i64 %6
  %11 = getelementptr inbounds float, float* %a, i64 %7
  %12 = getelementptr inbounds float, float* %a, i64 %8
  %13 = getelementptr inbounds float, float* %9, i32 0
  %14 = bitcast float* %13 to <32 x float>*
  %15 = getelementptr inbounds float, float* %10, i32 0
  %16 = bitcast float* %15 to <32 x float>*
  %17 = getelementptr inbounds float, float* %11, i32 0
  %18 = bitcast float* %17 to <32 x float>*
  %19 = getelementptr inbounds float, float* %12, i32 0
  %20 = bitcast float* %19 to <32 x float>*
  ; Each wide load reads 32 consecutive floats; only lanes 0, 8, 16 and 24
  ; (the stride-8 elements) are kept by the shuffles below.
  %wide.vec = load <32 x float>, <32 x float>* %14, align 4
  %wide.vec7 = load <32 x float>, <32 x float>* %16, align 4
  %wide.vec8 = load <32 x float>, <32 x float>* %18, align 4
  %wide.vec9 = load <32 x float>, <32 x float>* %20, align 4
  %strided.vec = shufflevector <32 x float> %wide.vec, <32 x float> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %strided.vec10 = shufflevector <32 x float> %wide.vec7, <32 x float> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %strided.vec11 = shufflevector <32 x float> %wide.vec8, <32 x float> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %strided.vec12 = shufflevector <32 x float> %wide.vec9, <32 x float> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  ; --- loads from %b (same offsets and lane selection as for %a) ---
  %21 = getelementptr inbounds float, float* %b, i64 %5
  %22 = getelementptr inbounds float, float* %b, i64 %6
  %23 = getelementptr inbounds float, float* %b, i64 %7
  %24 = getelementptr inbounds float, float* %b, i64 %8
  %25 = getelementptr inbounds float, float* %21, i32 0
  %26 = bitcast float* %25 to <32 x float>*
  %27 = getelementptr inbounds float, float* %22, i32 0
  %28 = bitcast float* %27 to <32 x float>*
  %29 = getelementptr inbounds float, float* %23, i32 0
  %30 = bitcast float* %29 to <32 x float>*
  %31 = getelementptr inbounds float, float* %24, i32 0
  %32 = bitcast float* %31 to <32 x float>*
  %wide.vec13 = load <32 x float>, <32 x float>* %26, align 4
  %wide.vec14 = load <32 x float>, <32 x float>* %28, align 4
  %wide.vec15 = load <32 x float>, <32 x float>* %30, align 4
  %wide.vec16 = load <32 x float>, <32 x float>* %32, align 4
  %strided.vec17 = shufflevector <32 x float> %wide.vec13, <32 x float> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %strided.vec18 = shufflevector <32 x float> %wide.vec14, <32 x float> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %strided.vec19 = shufflevector <32 x float> %wide.vec15, <32 x float> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %strided.vec20 = shufflevector <32 x float> %wide.vec16, <32 x float> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  ; acc = acc + 1.0 + a[...] + b[...] for each of the four accumulators.
  %33 = fadd fast <4 x float> %vec.phi, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  %34 = fadd fast <4 x float> %vec.phi4, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  %35 = fadd fast <4 x float> %vec.phi5, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  %36 = fadd fast <4 x float> %vec.phi6, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  %37 = fadd fast <4 x float> %33, %strided.vec
  %38 = fadd fast <4 x float> %34, %strided.vec10
  %39 = fadd fast <4 x float> %35, %strided.vec11
  %40 = fadd fast <4 x float> %36, %strided.vec12
  %41 = fadd fast <4 x float> %37, %strided.vec17
  %42 = fadd fast <4 x float> %38, %strided.vec18
  %43 = fadd fast <4 x float> %39, %strided.vec19
  %44 = fadd fast <4 x float> %40, %strided.vec20
  %index.next = add i64 %index, 16
  %45 = icmp eq i64 %index.next, %n.vec
  br i1 %45, label %middle.block, label %vector.body, !llvm.loop !0

middle.block:                                     ; preds = %vector.body
  ; Combine the four accumulators into one vector...
  %bin.rdx = fadd fast <4 x float> %42, %41
  %bin.rdx21 = fadd fast <4 x float> %43, %bin.rdx
  %bin.rdx22 = fadd fast <4 x float> %44, %bin.rdx21
  ; ...then reduce horizontally: add lanes 2,3 onto lanes 0,1, then lane 1
  ; onto lane 0, and take lane 0 as the scalar partial sum.
  %rdx.shuf = shufflevector <4 x float> %bin.rdx22, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx23 = fadd fast <4 x float> %bin.rdx22, %rdx.shuf
  %rdx.shuf24 = shufflevector <4 x float> %bin.rdx23, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx25 = fadd fast <4 x float> %bin.rdx23, %rdx.shuf24
  %46 = extractelement <4 x float> %bin.rdx25, i32 0
  ; Skip the scalar loop only if the vector loop covered every iteration.
  ; Because %4 is never zero, %n.vec never equals %2 here.
  %cmp.n = icmp eq i64 %2, %n.vec
  br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %for.body.preheader
  ; Starting element index and running sum for the scalar loop: either the
  ; state left by the vector loop, or (0, 0.0) when it was bypassed.
  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 0, %for.body.preheader ]
  %bc.merge.rdx = phi float [ 0.000000e+00, %for.body.preheader ], [ %46, %middle.block ]
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %middle.block, %for.body
  %add5.lcssa = phi float [ %add5, %for.body ], [ %46, %middle.block ]
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ; Result is 0.0 when the loop was never entered, else the accumulated sum.
  %s.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add5.lcssa, %for.cond.cleanup.loopexit ]
  ret float %s.0.lcssa

for.body:                                         ; preds = %for.body, %scalar.ph
  ; Scalar loop: s = s + 1.0 + a[i] + b[i]; i += 8; continue while i < n.
  %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
  %s = phi float [ %bc.merge.rdx, %scalar.ph ], [ %add5, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
  %t1 = load float, float* %arrayidx, align 4
  %arrayidx3 = getelementptr inbounds float, float* %b, i64 %indvars.iv
  %t2 = load float, float* %arrayidx3, align 4
  %add = fadd fast float %s, 1.000000e+00
  %add4 = fadd fast float %add, %t1
  %add5 = fadd fast float %add4, %t2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
  %cmp1 = icmp slt i64 %indvars.iv.next, %t0
  br i1 %cmp1, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !2
}

; Attribute group #0 (attached to @foo) sets the "target-features" string to
; "+avx".
attributes #0 = { "target-features"="+avx" }

; Loop metadata. !0 is attached to the back-edge branch of vector.body and !2
; to the back-edge branch of for.body. Both reference !1, which carries
; llvm.loop.isvectorized = 1; !2 additionally references !3, which carries
; llvm.loop.unroll.runtime.disable.
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.isvectorized", i32 1}
!2 = distinct !{!2, !3, !1}
!3 = !{!"llvm.loop.unroll.runtime.disable"}

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20191014/984dc9af/attachment-0001.html>