[llvm-bugs] [Bug 30786] New: if-conversion+vectorization leads to masked.gather instead of two masked.load's

Tue Oct 25 09:43:59 PDT 2016

https://llvm.org/bugs/show_bug.cgi?id=30786

            Bug ID: 30786
           Summary: if-conversion+vectorization leads to masked.gather
                    instead of two masked.load's
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: zvi.rackover at intel.com
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

Consider the following 'hello, world' for if-conversion+vectorization compiled
with clang -O3 -S -mllvm -march=skylake-avx512:
===============================================
void foo(float *restrict B, const float *restrict A, const float *restrict C,
          const float *restrict D, const float *restrict E) {
 #pragma clang loop vectorize(enable)
 #pragma clang loop unroll(disable)
   for (int i = 0; i < 4096; i++) {
     if (A[i] > 0) {
       B[i] = (E[i] * C[i]);
     } else {
       B[i] = (E[i] * D[i]);
     }
   }
 }

===============================================

The main issue, shown below by the dumps taken after some of the interesting
transformations, is that we are generating a
  %wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x
float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1
true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1

instead of two llvm.masked.load's (with complementary masks), which would be
more efficient on X86. To state the obvious: the accesses to C[i] and D[i]
are contiguous over i, so masked.load's are appropriate.

By the time the vectorizer runs, if-conversion has already happened, so the
IR it sees is:
   %C.D = select i1 %cmp1, float* %C, float* %D
   %.pn.in = getelementptr inbounds float, float* %C.D, i64 %idxprom
   %.pn = load float, float* %.pn.in, align 4, !tbaa !1
and it vectorizes that load as a gather, since the base pointer %C.D is
selected per iteration.

Following are the dumps after some of the transformations. (Note that the
function is named @A in these dumps rather than foo as in the source above.)

Before instcombine:
===============================================
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float*
noalias %D, float* noalias %E) local_unnamed_addr #0 {
 entry:
   br label %for.cond

 for.cond:                                         ; preds = %if.end, %entry
   %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
   %cmp = icmp slt i32 %i.0, 4096
   br i1 %cmp, label %for.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %for.cond
   ret void

 for.body:                                         ; preds = %for.cond
   %idxprom = sext i32 %i.0 to i64
   %arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
   %0 = load float, float* %arrayidx, align 4, !tbaa !1
   %cmp1 = fcmp ogt float %0, 0.000000e+00
   %arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
   %1 = load float, float* %arrayidx3, align 4, !tbaa !1
   br i1 %cmp1, label %if.then, label %if.else

 if.then:                                          ; preds = %for.body
   %arrayidx5 = getelementptr inbounds float, float* %C, i64 %idxprom
   %2 = load float, float* %arrayidx5, align 4, !tbaa !1
   %mul = fmul float %1, %2
   br label %if.end

 if.else:                                          ; preds = %for.body
   %arrayidx11 = getelementptr inbounds float, float* %D, i64 %idxprom
   %3 = load float, float* %arrayidx11, align 4, !tbaa !1
   %mul12 = fmul float %1, %3
   br label %if.end

 if.end:                                           ; preds = %if.else, %if.then
   %mul12.sink = phi float [ %mul12, %if.else ], [ %mul, %if.then ]
   %arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
   store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
   %inc = add nsw i32 %i.0, 1
   br label %for.cond, !llvm.loop !5
 }
===============================================

After instcombine:
===============================================
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float*
noalias %D, float* noalias %E) local_unnamed_addr #0 {
 entry:
   br label %for.cond

 for.cond:                                         ; preds = %if.end, %entry
   %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
   %cmp = icmp slt i32 %i.0, 4096
   br i1 %cmp, label %for.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %for.cond
   ret void

 for.body:                                         ; preds = %for.cond
   %idxprom = sext i32 %i.0 to i64
   %arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
   %0 = load float, float* %arrayidx, align 4, !tbaa !1
   %cmp1 = fcmp ogt float %0, 0.000000e+00
   %arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
   %1 = load float, float* %arrayidx3, align 4, !tbaa !1
   br i1 %cmp1, label %if.then, label %if.else

 if.then:                                          ; preds = %for.body
   br label %if.end

 if.else:                                          ; preds = %for.body
   br label %if.end

 if.end:                                           ; preds = %if.else, %if.then
   %D.pn = phi float* [ %D, %if.else ], [ %C, %if.then ]
   %.pn.in = getelementptr inbounds float, float* %D.pn, i64 %idxprom
   %.pn = load float, float* %.pn.in, align 4, !tbaa !1
   %mul12.sink = fmul float %1, %.pn
   %arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
   store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
   %inc = add nsw i32 %i.0, 1
   br label %for.cond, !llvm.loop !5
 }
===============================================

After simplifycfg:
===============================================
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float*
noalias %D, float* noalias %E) local_unnamed_addr #0 {
 entry:
   br label %for.cond

 for.cond:                                         ; preds = %for.body, %entry
   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   %cmp = icmp slt i32 %i.0, 4096
   br i1 %cmp, label %for.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %for.cond
   ret void

 for.body:                                         ; preds = %for.cond
   %idxprom = sext i32 %i.0 to i64
   %arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
   %0 = load float, float* %arrayidx, align 4, !tbaa !1
   %cmp1 = fcmp ogt float %0, 0.000000e+00
   %arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
   %1 = load float, float* %arrayidx3, align 4, !tbaa !1
   %C.D = select i1 %cmp1, float* %C, float* %D
   %.pn.in = getelementptr inbounds float, float* %C.D, i64 %idxprom
   %.pn = load float, float* %.pn.in, align 4, !tbaa !1
   %mul12.sink = fmul float %1, %.pn
   %arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
   store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
   %inc = add nsw i32 %i.0, 1
   br label %for.cond, !llvm.loop !5
 }
===============================================

After looprotate and friends:
===============================================
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly
%A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D,
float* noalias nocapture readonly %E) local_unnamed_addr #0 {
 entry:
   br label %for.body

 for.cond.cleanup:                                 ; preds = %for.body
   ret void

 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
   %0 = load float, float* %arrayidx, align 4, !tbaa !1
   %cmp1 = fcmp ogt float %0, 0.000000e+00
   %arrayidx3 = getelementptr inbounds float, float* %E, i64 %indvars.iv
   %1 = load float, float* %arrayidx3, align 4, !tbaa !1
   %C.D = select i1 %cmp1, float* %C, float* %D
   %.pn.in = getelementptr inbounds float, float* %C.D, i64 %indvars.iv
   %.pn = load float, float* %.pn.in, align 4, !tbaa !1
   %mul12.sink = fmul float %1, %.pn
   %arrayidx14 = getelementptr inbounds float, float* %B, i64 %indvars.iv
   store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 4096
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !5
 }
===============================================

And after Loop Vectorize we get (notice the llvm.masked.gather):
===============================================
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly
%A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D,
float* noalias nocapture readonly %E) local_unnamed_addr #0 {
entry:
  br i1 false, label %scalar.ph, label %min.iters.checked

min.iters.checked:                                ; preds = %entry
  br i1 false, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %min.iters.checked
  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %C, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x
float*> undef, <16 x i32> zeroinitializer
  %broadcast.splatinsert27 = insertelement <16 x float*> undef, float* %D, i32
0
  %broadcast.splat28 = shufflevector <16 x float*> %broadcast.splatinsert27,
<16 x float*> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body,
%vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6,
i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>,
%vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = extractelement <16 x i64> %vec.ind, i32 0
  %1 = getelementptr inbounds float, float* %A, i64 %0
  %2 = getelementptr float, float* %1, i32 0
  %3 = bitcast float* %2 to <16 x float>*
  %wide.load = load <16 x float>, <16 x float>* %3, align 4, !tbaa !1
  %4 = fcmp ogt <16 x float> %wide.load, zeroinitializer
  %5 = getelementptr inbounds float, float* %E, i64 %0
  %6 = getelementptr float, float* %5, i32 0
  %7 = bitcast float* %6 to <16 x float>*
  %wide.load26 = load <16 x float>, <16 x float>* %7, align 4, !tbaa !1
  %8 = extractelement <16 x i1> %4, i32 0
  %9 = select <16 x i1> %4, <16 x float*> %broadcast.splat, <16 x float*>
%broadcast.splat28
  %10 = extractelement <16 x float*> %9, i32 0
  %11 = getelementptr inbounds float, float* %10, i64 %0
  %12 = extractelement <16 x float*> %9, i32 1
  %13 = extractelement <16 x i64> %vec.ind, i32 1
  %14 = getelementptr inbounds float, float* %12, i64 %13
  %15 = extractelement <16 x float*> %9, i32 2
  %16 = extractelement <16 x i64> %vec.ind, i32 2
  %17 = getelementptr inbounds float, float* %15, i64 %16
  %18 = extractelement <16 x float*> %9, i32 3
  %19 = extractelement <16 x i64> %vec.ind, i32 3
  %20 = getelementptr inbounds float, float* %18, i64 %19
  %21 = extractelement <16 x float*> %9, i32 4
  %22 = extractelement <16 x i64> %vec.ind, i32 4
  %23 = getelementptr inbounds float, float* %21, i64 %22
  %24 = extractelement <16 x float*> %9, i32 5
  %25 = extractelement <16 x i64> %vec.ind, i32 5
  %26 = getelementptr inbounds float, float* %24, i64 %25
  %27 = extractelement <16 x float*> %9, i32 6
  %28 = extractelement <16 x i64> %vec.ind, i32 6
  %29 = getelementptr inbounds float, float* %27, i64 %28
  %30 = extractelement <16 x float*> %9, i32 7
  %31 = extractelement <16 x i64> %vec.ind, i32 7
  %32 = getelementptr inbounds float, float* %30, i64 %31
  %33 = extractelement <16 x float*> %9, i32 8
  %34 = extractelement <16 x i64> %vec.ind, i32 8
  %35 = getelementptr inbounds float, float* %33, i64 %34
  %36 = extractelement <16 x float*> %9, i32 9
  %37 = extractelement <16 x i64> %vec.ind, i32 9
  %38 = getelementptr inbounds float, float* %36, i64 %37
  %39 = extractelement <16 x float*> %9, i32 10
  %40 = extractelement <16 x i64> %vec.ind, i32 10
  %41 = getelementptr inbounds float, float* %39, i64 %40
  %42 = extractelement <16 x float*> %9, i32 11
  %43 = extractelement <16 x i64> %vec.ind, i32 11
  %44 = getelementptr inbounds float, float* %42, i64 %43
  %45 = extractelement <16 x float*> %9, i32 12
  %46 = extractelement <16 x i64> %vec.ind, i32 12
  %47 = getelementptr inbounds float, float* %45, i64 %46
  %48 = extractelement <16 x float*> %9, i32 13
  %49 = extractelement <16 x i64> %vec.ind, i32 13
  %50 = getelementptr inbounds float, float* %48, i64 %49
  %51 = extractelement <16 x float*> %9, i32 14
  %52 = extractelement <16 x i64> %vec.ind, i32 14
  %53 = getelementptr inbounds float, float* %51, i64 %52
  %54 = extractelement <16 x float*> %9, i32 15
  %55 = extractelement <16 x i64> %vec.ind, i32 15
  %56 = getelementptr inbounds float, float* %54, i64 %55
  %VectorGep = getelementptr inbounds float, <16 x float*> %9, <16 x i64>
%vec.ind
  %wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x
float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1
true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1
  %57 = fmul <16 x float> %wide.load26, %wide.masked.gather
  %58 = getelementptr inbounds float, float* %B, i64 %0
  %59 = getelementptr float, float* %58, i32 0
  %60 = bitcast float* %59 to <16 x float>*
  store <16 x float> %57, <16 x float>* %60, align 4, !tbaa !1
  %index.next = add i64 %index, 16
  %vec.ind.next = add <16 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64
16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64
16, i64 16>
  %61 = icmp eq i64 %index.next, 4096
  br i1 %61, label %middle.block, label %vector.body, !llvm.loop !5

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 4096, 4096
  br i1 %cmp.n, label %for.cond.cleanup, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block,
%min.iters.checked, %entry
  %bc.resume.val = phi i64 [ 4096, %middle.block ], [ 0, %entry ], [ 0,
%min.iters.checked ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block,
%for.body
  ret void

for.body:                                         ; preds = %for.body,
%scalar.ph
  %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next,
%for.body ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %62 = load float, float* %arrayidx, align 4, !tbaa !1
  %cmp1 = fcmp ogt float %62, 0.000000e+00
  %arrayidx3 = getelementptr inbounds float, float* %E, i64 %indvars.iv
  %63 = load float, float* %arrayidx3, align 4, !tbaa !1
  %C.D = select i1 %cmp1, float* %C, float* %D
  %.pn.in = getelementptr inbounds float, float* %C.D, i64 %indvars.iv
  %.pn = load float, float* %.pn.in, align 4, !tbaa !1
  %mul12.sink = fmul float %63, %.pn
  %arrayidx14 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 4096
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}
===============================================

The final optimized IR is:
===============================================
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly
%A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D,
float* noalias nocapture readonly %E) local_unnamed_addr #0 {
entry:
  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %C, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x
float*> undef, <16 x i32> zeroinitializer
  %broadcast.splatinsert27 = insertelement <16 x float*> undef, float* %D, i32
0
  %broadcast.splat28 = shufflevector <16 x float*> %broadcast.splatinsert27,
<16 x float*> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body,
%entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6,
i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %entry ],
[ %vec.ind.next, %vector.body ]
  %0 = extractelement <16 x i64> %vec.ind, i32 0
  %1 = getelementptr inbounds float, float* %A, i64 %0
  %2 = bitcast float* %1 to <16 x float>*
  %wide.load = load <16 x float>, <16 x float>* %2, align 4, !tbaa !1
  %3 = fcmp ogt <16 x float> %wide.load, zeroinitializer
  %4 = getelementptr inbounds float, float* %E, i64 %0
  %5 = bitcast float* %4 to <16 x float>*
  %wide.load26 = load <16 x float>, <16 x float>* %5, align 4, !tbaa !1
  %6 = select <16 x i1> %3, <16 x float*> %broadcast.splat, <16 x float*>
%broadcast.splat28
  %VectorGep = getelementptr inbounds float, <16 x float*> %6, <16 x i64>
%vec.ind
  %wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x
float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1
true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1
  %7 = fmul <16 x float> %wide.load26, %wide.masked.gather
  %8 = getelementptr inbounds float, float* %B, i64 %0
  %9 = bitcast float* %8 to <16 x float>*
  store <16 x float> %7, <16 x float>* %9, align 4, !tbaa !1
  %index.next = add i64 %index, 16
  %vec.ind.next = add <16 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64
16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64
16, i64 16>
  %10 = icmp eq i64 %index.next, 4096
  br i1 %10, label %for.cond.cleanup, label %vector.body, !llvm.loop !5

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
===============================================

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20161025/2eefbb2b/attachment-0001.html>