<html>
<head>
<base href="https://llvm.org/bugs/" />
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW --- - if-conversion+vectorization leads to masked.gather instead of two masked.load's"
href="https://llvm.org/bugs/show_bug.cgi?id=30786">30786</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>if-conversion+vectorization leads to masked.gather instead of two masked.load's
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>All
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>normal
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Loop Optimizer
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>zvi.rackover@intel.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr>
<tr>
<th>Classification</th>
<td>Unclassified
</td>
</tr></table>
<div>
<pre>Consider the following 'hello, world' of if-conversion + vectorization,
compiled with clang -O3 -S -mllvm -march=skylake-avx512:
===============================================
void foo(float *restrict B, const float *restrict A, const float *restrict C,
         const float *restrict D, const float *restrict E) {
#pragma clang loop vectorize(enable)
#pragma clang loop unroll(disable)
  for (int i = 0; i < 4096; i++) {
    if (A[i] > 0) {
      B[i] = (E[i] * C[i]);
    } else {
      B[i] = (E[i] * D[i]);
    }
  }
}
===============================================
The main issue, shown by the dumps below after some of the interesting
transformations, is that we are generating a

%wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1

instead of two llvm.masked.load's (with complementary masks), which would be
more efficient on X86. Stating the obvious: the accesses to C[i] and D[i]
are contiguous over i, so masked loads are the appropriate form.
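For reference, the shape we would rather see in the vector body is two
complementary masked loads feeding a select. This is a hand-written sketch,
not compiler output: %mask stands for the vectorized A[i] > 0 compare, %Cgep
and %Dgep for &C[i] and &D[i] bitcast to <16 x float>*, and the exact
overload mangling of llvm.masked.load varies across LLVM revisions:
===============================================
%mask.not = xor <16 x i1> %mask, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
%loadC = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %Cgep, i32 4, <16 x i1> %mask, <16 x float> undef)
%loadD = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %Dgep, i32 4, <16 x i1> %mask.not, <16 x float> undef)
%val = select <16 x i1> %mask, <16 x float> %loadC, <16 x float> %loadD
===============================================
On AVX-512 the masked loads lower to masked vector moves, which are far
cheaper than a hardware gather.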
The vectorizer sees the IR after if-conversion has already taken place:

%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %idxprom
%.pn = load float, float* %.pn.in, align 4, !tbaa !1

and vectorizes the load into a gather, because the pointer operand is no
longer visibly a contiguous access.
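To spell out why the select-of-pointers form is hard to undo: each arm of
the select, taken alone, is a contiguous access, but rewriting to two plain
loads plus a select of the loaded values is not legal in scalar IR, since
the not-taken arm may not be dereferenceable. An illustrative comparison
(hand-written; the %gepC/%gepD/%vC/%vD names are made up for this sketch):
===============================================
; What the vectorizer sees: one load from a select of pointers.
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %idxprom
%.pn = load float, float* %.pn.in, align 4

; The form that exposes the two contiguous accesses. Unsafe as plain
; scalar loads (the untaken arm may fault), but exactly what a pair of
; complementary masked loads expresses safely in vector form.
%gepC = getelementptr inbounds float, float* %C, i64 %idxprom
%gepD = getelementptr inbounds float, float* %D, i64 %idxprom
%vC = load float, float* %gepC, align 4
%vD = load float, float* %gepD, align 4
%v = select i1 %cmp1, float %vC, float %vD
===============================================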
Following are the dumps after some transformations.
Before instcombine:
===============================================
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float* noalias %D, float* noalias %E) local_unnamed_addr #0 {
entry:
br label %for.cond
for.cond: ; preds = %if.end, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
%cmp = icmp slt i32 %i.0, 4096
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
ret void
for.body: ; preds = %for.cond
%idxprom = sext i32 %i.0 to i64
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
br i1 %cmp1, label %if.then, label %if.else
if.then: ; preds = %for.body
%arrayidx5 = getelementptr inbounds float, float* %C, i64 %idxprom
%2 = load float, float* %arrayidx5, align 4, !tbaa !1
%mul = fmul float %1, %2
br label %if.end
if.else: ; preds = %for.body
%arrayidx11 = getelementptr inbounds float, float* %D, i64 %idxprom
%3 = load float, float* %arrayidx11, align 4, !tbaa !1
%mul12 = fmul float %1, %3
br label %if.end
if.end: ; preds = %if.else, %if.then
%mul12.sink = phi float [ %mul12, %if.else ], [ %mul, %if.then ]
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !5
}
===============================================
After instcombine:
===============================================
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float* noalias %D, float* noalias %E) local_unnamed_addr #0 {
entry:
br label %for.cond
for.cond: ; preds = %if.end, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
%cmp = icmp slt i32 %i.0, 4096
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
ret void
for.body: ; preds = %for.cond
%idxprom = sext i32 %i.0 to i64
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
br i1 %cmp1, label %if.then, label %if.else
if.then: ; preds = %for.body
br label %if.end
if.else: ; preds = %for.body
br label %if.end
if.end: ; preds = %if.else, %if.then
%D.pn = phi float* [ %D, %if.else ], [ %C, %if.then ]
%.pn.in = getelementptr inbounds float, float* %D.pn, i64 %idxprom
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %1, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !5
}
===============================================
After simplifycfg:
===============================================
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float* noalias %D, float* noalias %E) local_unnamed_addr #0 {
entry:
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp slt i32 %i.0, 4096
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
ret void
for.body: ; preds = %for.cond
%idxprom = sext i32 %i.0 to i64
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %idxprom
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %1, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !5
}
===============================================
After looprotate and friends:
===============================================
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly %A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D, float* noalias nocapture readonly %E) local_unnamed_addr #0 {
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %indvars.iv
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %indvars.iv
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %1, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %indvars.iv
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 4096
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !5
}
===============================================
And after Loop Vectorize we get (notice the llvm.masked.gather):
===============================================
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly %A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D, float* noalias nocapture readonly %E) local_unnamed_addr #0 {
entry:
br i1 false, label %scalar.ph, label %min.iters.checked
min.iters.checked: ; preds = %entry
br i1 false, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %min.iters.checked
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %C, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%broadcast.splatinsert27 = insertelement <16 x float*> undef, float* %D, i32 0
%broadcast.splat28 = shufflevector <16 x float*> %broadcast.splatinsert27, <16 x float*> undef, <16 x i32> zeroinitializer
br label %vector.body
vector.body:                                      ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = extractelement <16 x i64> %vec.ind, i32 0
%1 = getelementptr inbounds float, float* %A, i64 %0
%2 = getelementptr float, float* %1, i32 0
%3 = bitcast float* %2 to <16 x float>*
%wide.load = load <16 x float>, <16 x float>* %3, align 4, !tbaa !1
%4 = fcmp ogt <16 x float> %wide.load, zeroinitializer
%5 = getelementptr inbounds float, float* %E, i64 %0
%6 = getelementptr float, float* %5, i32 0
%7 = bitcast float* %6 to <16 x float>*
%wide.load26 = load <16 x float>, <16 x float>* %7, align 4, !tbaa !1
%8 = extractelement <16 x i1> %4, i32 0
%9 = select <16 x i1> %4, <16 x float*> %broadcast.splat, <16 x float*> %broadcast.splat28
%10 = extractelement <16 x float*> %9, i32 0
%11 = getelementptr inbounds float, float* %10, i64 %0
%12 = extractelement <16 x float*> %9, i32 1
%13 = extractelement <16 x i64> %vec.ind, i32 1
%14 = getelementptr inbounds float, float* %12, i64 %13
%15 = extractelement <16 x float*> %9, i32 2
%16 = extractelement <16 x i64> %vec.ind, i32 2
%17 = getelementptr inbounds float, float* %15, i64 %16
%18 = extractelement <16 x float*> %9, i32 3
%19 = extractelement <16 x i64> %vec.ind, i32 3
%20 = getelementptr inbounds float, float* %18, i64 %19
%21 = extractelement <16 x float*> %9, i32 4
%22 = extractelement <16 x i64> %vec.ind, i32 4
%23 = getelementptr inbounds float, float* %21, i64 %22
%24 = extractelement <16 x float*> %9, i32 5
%25 = extractelement <16 x i64> %vec.ind, i32 5
%26 = getelementptr inbounds float, float* %24, i64 %25
%27 = extractelement <16 x float*> %9, i32 6
%28 = extractelement <16 x i64> %vec.ind, i32 6
%29 = getelementptr inbounds float, float* %27, i64 %28
%30 = extractelement <16 x float*> %9, i32 7
%31 = extractelement <16 x i64> %vec.ind, i32 7
%32 = getelementptr inbounds float, float* %30, i64 %31
%33 = extractelement <16 x float*> %9, i32 8
%34 = extractelement <16 x i64> %vec.ind, i32 8
%35 = getelementptr inbounds float, float* %33, i64 %34
%36 = extractelement <16 x float*> %9, i32 9
%37 = extractelement <16 x i64> %vec.ind, i32 9
%38 = getelementptr inbounds float, float* %36, i64 %37
%39 = extractelement <16 x float*> %9, i32 10
%40 = extractelement <16 x i64> %vec.ind, i32 10
%41 = getelementptr inbounds float, float* %39, i64 %40
%42 = extractelement <16 x float*> %9, i32 11
%43 = extractelement <16 x i64> %vec.ind, i32 11
%44 = getelementptr inbounds float, float* %42, i64 %43
%45 = extractelement <16 x float*> %9, i32 12
%46 = extractelement <16 x i64> %vec.ind, i32 12
%47 = getelementptr inbounds float, float* %45, i64 %46
%48 = extractelement <16 x float*> %9, i32 13
%49 = extractelement <16 x i64> %vec.ind, i32 13
%50 = getelementptr inbounds float, float* %48, i64 %49
%51 = extractelement <16 x float*> %9, i32 14
%52 = extractelement <16 x i64> %vec.ind, i32 14
%53 = getelementptr inbounds float, float* %51, i64 %52
%54 = extractelement <16 x float*> %9, i32 15
%55 = extractelement <16 x i64> %vec.ind, i32 15
%56 = getelementptr inbounds float, float* %54, i64 %55
%VectorGep = getelementptr inbounds float, <16 x float*> %9, <16 x i64> %vec.ind
%wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1
%57 = fmul <16 x float> %wide.load26, %wide.masked.gather
%58 = getelementptr inbounds float, float* %B, i64 %0
%59 = getelementptr float, float* %58, i32 0
%60 = bitcast float* %59 to <16 x float>*
store <16 x float> %57, <16 x float>* %60, align 4, !tbaa !1
%index.next = add i64 %index, 16
%vec.ind.next = add <16 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
%61 = icmp eq i64 %index.next, 4096
br i1 %61, label %middle.block, label %vector.body, !llvm.loop !5
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 4096, 4096
br i1 %cmp.n, label %for.cond.cleanup, label %scalar.ph
scalar.ph:                                        ; preds = %middle.block, %min.iters.checked, %entry
%bc.resume.val = phi i64 [ 4096, %middle.block ], [ 0, %entry ], [ 0, %min.iters.checked ]
br label %for.body
for.cond.cleanup:                                 ; preds = %middle.block, %for.body
ret void
for.body:                                         ; preds = %for.body, %scalar.ph
%indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
%62 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %62, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %indvars.iv
%63 = load float, float* %arrayidx3, align 4, !tbaa !1
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %indvars.iv
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %63, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %indvars.iv
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 4096
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}
===============================================
The final optimized IR is:
===============================================
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly %A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D, float* noalias nocapture readonly %E) local_unnamed_addr #0 {
entry:
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %C, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%broadcast.splatinsert27 = insertelement <16 x float*> undef, float* %D, i32 0
%broadcast.splat28 = shufflevector <16 x float*> %broadcast.splatinsert27, <16 x float*> undef, <16 x i32> zeroinitializer
br label %vector.body
vector.body:                                      ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %entry ], [ %vec.ind.next, %vector.body ]
%0 = extractelement <16 x i64> %vec.ind, i32 0
%1 = getelementptr inbounds float, float* %A, i64 %0
%2 = bitcast float* %1 to <16 x float>*
%wide.load = load <16 x float>, <16 x float>* %2, align 4, !tbaa !1
%3 = fcmp ogt <16 x float> %wide.load, zeroinitializer
%4 = getelementptr inbounds float, float* %E, i64 %0
%5 = bitcast float* %4 to <16 x float>*
%wide.load26 = load <16 x float>, <16 x float>* %5, align 4, !tbaa !1
%6 = select <16 x i1> %3, <16 x float*> %broadcast.splat, <16 x float*> %broadcast.splat28
%VectorGep = getelementptr inbounds float, <16 x float*> %6, <16 x i64> %vec.ind
%wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1
%7 = fmul <16 x float> %wide.load26, %wide.masked.gather
%8 = getelementptr inbounds float, float* %B, i64 %0
%9 = bitcast float* %8 to <16 x float>*
store <16 x float> %7, <16 x float>* %9, align 4, !tbaa !1
%index.next = add i64 %index, 16
%vec.ind.next = add <16 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
%10 = icmp eq i64 %index.next, 4096
br i1 %10, label %for.cond.cleanup, label %vector.body, !llvm.loop !5
for.cond.cleanup: ; preds = %vector.body
ret void
}
===============================================</pre>
</div>
</body>
</html>