<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><span class="vcard"><a class="email" href="mailto:florian_hahn@apple.com" title="Florian Hahn &lt;florian_hahn@apple.com&gt;"> <span class="fn">Florian Hahn</span></a>

</span> changed

          <a class="bz_bug_link 

          bz_status_RESOLVED  bz_closed"

   title="RESOLVED FIXED - Code explosion when LoopVectorize vectorizes a loop with a stride of 8 floats, part 2"

   href="https://bugs.llvm.org/show_bug.cgi?id=27881">bug 27881</a>

          <br>

             <table border="1" cellspacing="0" cellpadding="8">

          <tr>

            <th>What</th>

            <th>Removed</th>

            <th>Added</th>

          </tr>

         <tr>

           <td style="text-align:right;">Status</td>

           <td>NEW

           </td>

           <td>RESOLVED

           </td>

         </tr>

         <tr>

           <td style="text-align:right;">Resolution</td>

           <td>---

           </td>

           <td>FIXED

           </td>

         </tr>

         <tr>

           <td style="text-align:right;">CC</td>

           <td>

           </td>

           <td>florian_hahn@apple.com

           </td>

         </tr></table>

      <p>

        <div>

            <b><a class="bz_bug_link 

          bz_status_RESOLVED  bz_closed"

   title="RESOLVED FIXED - Code explosion when LoopVectorize vectorizes a loop with a stride of 8 floats, part 2"

   href="https://bugs.llvm.org/show_bug.cgi?id=27881#c16">Comment # 16</a>

              on <a class="bz_bug_link 

          bz_status_RESOLVED  bz_closed"

   title="RESOLVED FIXED - Code explosion when LoopVectorize vectorizes a loop with a stride of 8 floats, part 2"

   href="https://bugs.llvm.org/show_bug.cgi?id=27881">bug 27881</a>

              from <span class="vcard"><a class="email" href="mailto:florian_hahn@apple.com" title="Florian Hahn &lt;florian_hahn@apple.com&gt;"> <span class="fn">Florian Hahn</span></a>

</span></b>

        <pre>Looks like this has been fixed in trunk. Please re-open if it is still an

issue.

For the IR snippet, we now get the code below. The vectorised version seems to

be around 3x faster on my X86 box.

source_filename = "tc2.ll"

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"

target triple = "x86_64-apple-macosx10.11.0"

define float @foo(float* %a, float* %b, i32 %n) #0 {

entry:

  %cmp113 = icmp sgt i32 %n, 0

  br i1 %cmp113, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry

  %t0 = sext i32 %n to i64

  %0 = add i64 %t0, -1

  %1 = lshr i64 %0, 3

  %2 = add nuw nsw i64 %1, 1

  %min.iters.check = icmp ule i64 %2, 16

  br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader

  %n.mod.vf = urem i64 %2, 16

  %3 = icmp eq i64 %n.mod.vf, 0

  %4 = select i1 %3, i64 16, i64 %n.mod.vf

  %n.vec = sub i64 %2, %4

  %ind.end = mul i64 %n.vec, 8

  br label %vector.body

vector.body:                                      ; preds = %vector.body,

%vector.ph

  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]

  %vec.phi = phi &lt;4 x float&gt; [ zeroinitializer, %vector.ph ], [ %41,

%vector.body ]

  %vec.phi4 = phi &lt;4 x float&gt; [ zeroinitializer, %vector.ph ], [ %42,

%vector.body ]

  %vec.phi5 = phi &lt;4 x float&gt; [ zeroinitializer, %vector.ph ], [ %43,

%vector.body ]

  %vec.phi6 = phi &lt;4 x float&gt; [ zeroinitializer, %vector.ph ], [ %44,

%vector.body ]

  %offset.idx = mul i64 %index, 8

  %broadcast.splatinsert = insertelement &lt;4 x i64&gt; undef, i64 %offset.idx, i32

0

  %broadcast.splat = shufflevector &lt;4 x i64&gt; %broadcast.splatinsert, &lt;4 x i64&gt;

undef, &lt;4 x i32&gt; zeroinitializer

  %induction = add &lt;4 x i64&gt; %broadcast.splat, &lt;i64 0, i64 8, i64 16, i64 24&gt;

  %induction1 = add &lt;4 x i64&gt; %broadcast.splat, &lt;i64 32, i64 40, i64 48, i64

56&gt;

  %induction2 = add &lt;4 x i64&gt; %broadcast.splat, &lt;i64 64, i64 72, i64 80, i64

88&gt;

  %induction3 = add &lt;4 x i64&gt; %broadcast.splat, &lt;i64 96, i64 104, i64 112, i64

120&gt;

  %5 = add i64 %offset.idx, 0

  %6 = add i64 %offset.idx, 32

  %7 = add i64 %offset.idx, 64

  %8 = add i64 %offset.idx, 96

  %9 = getelementptr inbounds float, float* %a, i64 %5

  %10 = getelementptr inbounds float, float* %a, i64 %6

  %11 = getelementptr inbounds float, float* %a, i64 %7

  %12 = getelementptr inbounds float, float* %a, i64 %8

  %13 = getelementptr inbounds float, float* %9, i32 0

  %14 = bitcast float* %13 to &lt;32 x float&gt;*

  %15 = getelementptr inbounds float, float* %10, i32 0

  %16 = bitcast float* %15 to &lt;32 x float&gt;*

  %17 = getelementptr inbounds float, float* %11, i32 0

  %18 = bitcast float* %17 to &lt;32 x float&gt;*

  %19 = getelementptr inbounds float, float* %12, i32 0

  %20 = bitcast float* %19 to &lt;32 x float&gt;*

  %wide.vec = load &lt;32 x float&gt;, &lt;32 x float&gt;* %14, align 4

  %wide.vec7 = load &lt;32 x float&gt;, &lt;32 x float&gt;* %16, align 4

  %wide.vec8 = load &lt;32 x float&gt;, &lt;32 x float&gt;* %18, align 4

  %wide.vec9 = load &lt;32 x float&gt;, &lt;32 x float&gt;* %20, align 4

  %strided.vec = shufflevector &lt;32 x float&gt; %wide.vec, &lt;32 x float&gt; undef, &lt;4 x

i32&gt; &lt;i32 0, i32 8, i32 16, i32 24&gt;

  %strided.vec10 = shufflevector &lt;32 x float&gt; %wide.vec7, &lt;32 x float&gt; undef,

&lt;4 x i32&gt; &lt;i32 0, i32 8, i32 16, i32 24&gt;

  %strided.vec11 = shufflevector &lt;32 x float&gt; %wide.vec8, &lt;32 x float&gt; undef,

&lt;4 x i32&gt; &lt;i32 0, i32 8, i32 16, i32 24&gt;

  %strided.vec12 = shufflevector &lt;32 x float&gt; %wide.vec9, &lt;32 x float&gt; undef,

&lt;4 x i32&gt; &lt;i32 0, i32 8, i32 16, i32 24&gt;

  %21 = getelementptr inbounds float, float* %b, i64 %5

  %22 = getelementptr inbounds float, float* %b, i64 %6

  %23 = getelementptr inbounds float, float* %b, i64 %7

  %24 = getelementptr inbounds float, float* %b, i64 %8

  %25 = getelementptr inbounds float, float* %21, i32 0

  %26 = bitcast float* %25 to &lt;32 x float&gt;*

  %27 = getelementptr inbounds float, float* %22, i32 0

  %28 = bitcast float* %27 to &lt;32 x float&gt;*

  %29 = getelementptr inbounds float, float* %23, i32 0

  %30 = bitcast float* %29 to &lt;32 x float&gt;*

  %31 = getelementptr inbounds float, float* %24, i32 0

  %32 = bitcast float* %31 to &lt;32 x float&gt;*

  %wide.vec13 = load &lt;32 x float&gt;, &lt;32 x float&gt;* %26, align 4

  %wide.vec14 = load &lt;32 x float&gt;, &lt;32 x float&gt;* %28, align 4

  %wide.vec15 = load &lt;32 x float&gt;, &lt;32 x float&gt;* %30, align 4

  %wide.vec16 = load &lt;32 x float&gt;, &lt;32 x float&gt;* %32, align 4

  %strided.vec17 = shufflevector &lt;32 x float&gt; %wide.vec13, &lt;32 x float&gt; undef,

&lt;4 x i32&gt; &lt;i32 0, i32 8, i32 16, i32 24&gt;

  %strided.vec18 = shufflevector &lt;32 x float&gt; %wide.vec14, &lt;32 x float&gt; undef,

&lt;4 x i32&gt; &lt;i32 0, i32 8, i32 16, i32 24&gt;

  %strided.vec19 = shufflevector &lt;32 x float&gt; %wide.vec15, &lt;32 x float&gt; undef,

&lt;4 x i32&gt; &lt;i32 0, i32 8, i32 16, i32 24&gt;

  %strided.vec20 = shufflevector &lt;32 x float&gt; %wide.vec16, &lt;32 x float&gt; undef,

&lt;4 x i32&gt; &lt;i32 0, i32 8, i32 16, i32 24&gt;

  %33 = fadd fast &lt;4 x float&gt; %vec.phi, &lt;float 1.000000e+00, float

1.000000e+00, float 1.000000e+00, float 1.000000e+00&gt;

  %34 = fadd fast &lt;4 x float&gt; %vec.phi4, &lt;float 1.000000e+00, float

1.000000e+00, float 1.000000e+00, float 1.000000e+00&gt;

  %35 = fadd fast &lt;4 x float&gt; %vec.phi5, &lt;float 1.000000e+00, float

1.000000e+00, float 1.000000e+00, float 1.000000e+00&gt;

  %36 = fadd fast &lt;4 x float&gt; %vec.phi6, &lt;float 1.000000e+00, float

1.000000e+00, float 1.000000e+00, float 1.000000e+00&gt;

  %37 = fadd fast &lt;4 x float&gt; %33, %strided.vec

  %38 = fadd fast &lt;4 x float&gt; %34, %strided.vec10

  %39 = fadd fast &lt;4 x float&gt; %35, %strided.vec11

  %40 = fadd fast &lt;4 x float&gt; %36, %strided.vec12

  %41 = fadd fast &lt;4 x float&gt; %37, %strided.vec17

  %42 = fadd fast &lt;4 x float&gt; %38, %strided.vec18

  %43 = fadd fast &lt;4 x float&gt; %39, %strided.vec19

  %44 = fadd fast &lt;4 x float&gt; %40, %strided.vec20

  %index.next = add i64 %index, 16

  %45 = icmp eq i64 %index.next, %n.vec

  br i1 %45, label %middle.block, label %vector.body, !llvm.loop !0

middle.block:                                     ; preds = %vector.body

  %bin.rdx = fadd fast &lt;4 x float&gt; %42, %41

  %bin.rdx21 = fadd fast &lt;4 x float&gt; %43, %bin.rdx

  %bin.rdx22 = fadd fast &lt;4 x float&gt; %44, %bin.rdx21

  %rdx.shuf = shufflevector &lt;4 x float&gt; %bin.rdx22, &lt;4 x float&gt; undef, &lt;4 x

i32&gt; &lt;i32 2, i32 3, i32 undef, i32 undef&gt;

  %bin.rdx23 = fadd fast &lt;4 x float&gt; %bin.rdx22, %rdx.shuf

  %rdx.shuf24 = shufflevector &lt;4 x float&gt; %bin.rdx23, &lt;4 x float&gt; undef, &lt;4 x

i32&gt; &lt;i32 1, i32 undef, i32 undef, i32 undef&gt;

  %bin.rdx25 = fadd fast &lt;4 x float&gt; %bin.rdx23, %rdx.shuf24

  %46 = extractelement &lt;4 x float&gt; %bin.rdx25, i32 0

  %cmp.n = icmp eq i64 %2, %n.vec

  br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block,

%for.body.preheader

  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 0,

%for.body.preheader ]

  %bc.merge.rdx = phi float [ 0.000000e+00, %for.body.preheader ], [ %46,

%middle.block ]

  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %middle.block,

%for.body

  %add5.lcssa = phi float [ %add5, %for.body ], [ %46, %middle.block ]

  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds =

%for.cond.cleanup.loopexit, %entry

  %s.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add5.lcssa,

%for.cond.cleanup.loopexit ]

  ret float %s.0.lcssa

for.body:                                         ; preds = %for.body,

%scalar.ph

  %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next,

%for.body ]

  %s = phi float [ %bc.merge.rdx, %scalar.ph ], [ %add5, %for.body ]

  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv

  %t1 = load float, float* %arrayidx, align 4

  %arrayidx3 = getelementptr inbounds float, float* %b, i64 %indvars.iv

  %t2 = load float, float* %arrayidx3, align 4

  %add = fadd fast float %s, 1.000000e+00

  %add4 = fadd fast float %add, %t1

  %add5 = fadd fast float %add4, %t2

  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8

  %cmp1 = icmp slt i64 %indvars.iv.next, %t0

  br i1 %cmp1, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !2

}

attributes #0 = { "target-features"="+avx" }

!0 = distinct !{!0, !1}

!1 = !{!"llvm.loop.isvectorized", i32 1}

!2 = distinct !{!2, !3, !1}

!3 = !{!"llvm.loop.unroll.runtime.disable"}</pre>

        </div>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>