<html>
<head>
<base href="https://llvm.org/bugs/" />
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW --- - if-conversion+vectorization leads to masked.gather instead of two masked.load's"
href="https://llvm.org/bugs/show_bug.cgi?id=30786">30786</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>if-conversion+vectorization leads to masked.gather instead of two masked.load's
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>All
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>normal
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Loop Optimizer
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>zvi.rackover@intel.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr>
<tr>
<th>Classification</th>
<td>Unclassified
</td>
</tr></table>
<div>
<pre>Consider the following 'hello, world' of if-conversion + vectorization,
compiled with clang -O3 -S -mllvm -march=skylake-avx512:
===============================================
void foo(float *restrict B, const float *restrict A, const float *restrict C,
         const float *restrict D, const float *restrict E) {
#pragma clang loop vectorize(enable)
#pragma clang loop unroll(disable)
  for (int i = 0; i < 4096; i++) {
    if (A[i] > 0) {
      B[i] = (E[i] * C[i]);
    } else {
      B[i] = (E[i] * D[i]);
    }
  }
}
===============================================
The main issue, shown by the dumps below after some of the interesting
transformations, is that we are generating a

%wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1

instead of two llvm.masked.load's (with complementary masks), which would be
more efficient on X86. Stating the obvious: the accesses to C[i] and D[i]
are contiguous over i, so masked loads are the appropriate form.
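For reference, the shape we would rather see in the vector body is two
complementary masked loads feeding a select. This is a hand-written sketch,
not compiler output: %mask stands for the vectorized A[i] > 0 compare, %Cgep
and %Dgep for &C[i] and &D[i] bitcast to <16 x float>*, and the exact
overload mangling of llvm.masked.load varies across LLVM revisions:
===============================================
%mask.not = xor <16 x i1> %mask, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
%loadC = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %Cgep, i32 4, <16 x i1> %mask, <16 x float> undef)
%loadD = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %Dgep, i32 4, <16 x i1> %mask.not, <16 x float> undef)
%val = select <16 x i1> %mask, <16 x float> %loadC, <16 x float> %loadD
===============================================
On AVX-512 the masked loads lower to masked vector moves, which are far
cheaper than a hardware gather.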
The vectorizer sees the IR after if-conversion has already taken place:

%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %idxprom
%.pn = load float, float* %.pn.in, align 4, !tbaa !1

and vectorizes the load into a gather, because the pointer operand is no
longer visibly a contiguous access.
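To spell out why the select-of-pointers form is hard to undo: each arm of
the select, taken alone, is a contiguous access, but rewriting to two plain
loads plus a select of the loaded values is not legal in scalar IR, since
the not-taken arm may not be dereferenceable. An illustrative comparison
(hand-written; the %gepC/%gepD/%vC/%vD names are made up for this sketch):
===============================================
; What the vectorizer sees: one load from a select of pointers.
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %idxprom
%.pn = load float, float* %.pn.in, align 4

; The form that exposes the two contiguous accesses. Unsafe as plain
; scalar loads (the untaken arm may fault), but exactly what a pair of
; complementary masked loads expresses safely in vector form.
%gepC = getelementptr inbounds float, float* %C, i64 %idxprom
%gepD = getelementptr inbounds float, float* %D, i64 %idxprom
%vC = load float, float* %gepC, align 4
%vD = load float, float* %gepD, align 4
%v = select i1 %cmp1, float %vC, float %vD
===============================================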
Following are the dumps after some transformations.
Before instcombine:
===============================================
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float* noalias %D, float* noalias %E) local_unnamed_addr #0 {
entry:
br label %for.cond
for.cond: ; preds = %if.end, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
%cmp = icmp slt i32 %i.0, 4096
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
ret void
for.body: ; preds = %for.cond
%idxprom = sext i32 %i.0 to i64
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
br i1 %cmp1, label %if.then, label %if.else
if.then: ; preds = %for.body
%arrayidx5 = getelementptr inbounds float, float* %C, i64 %idxprom
%2 = load float, float* %arrayidx5, align 4, !tbaa !1
%mul = fmul float %1, %2
br label %if.end
if.else: ; preds = %for.body
%arrayidx11 = getelementptr inbounds float, float* %D, i64 %idxprom
%3 = load float, float* %arrayidx11, align 4, !tbaa !1
%mul12 = fmul float %1, %3
br label %if.end
if.end: ; preds = %if.else, %if.then
%mul12.sink = phi float [ %mul12, %if.else ], [ %mul, %if.then ]
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !5
}
===============================================
After instcombine:
===============================================
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float* noalias %D, float* noalias %E) local_unnamed_addr #0 {
entry:
br label %for.cond
for.cond: ; preds = %if.end, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
%cmp = icmp slt i32 %i.0, 4096
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
ret void
for.body: ; preds = %for.cond
%idxprom = sext i32 %i.0 to i64
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
br i1 %cmp1, label %if.then, label %if.else
if.then: ; preds = %for.body
br label %if.end
if.else: ; preds = %for.body
br label %if.end
if.end: ; preds = %if.else, %if.then
%D.pn = phi float* [ %D, %if.else ], [ %C, %if.then ]
%.pn.in = getelementptr inbounds float, float* %D.pn, i64 %idxprom
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %1, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !5
}
===============================================
After simplifycfg:
===============================================
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float* noalias %D, float* noalias %E) local_unnamed_addr #0 {
entry:
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp slt i32 %i.0, 4096
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
ret void
for.body: ; preds = %for.cond
%idxprom = sext i32 %i.0 to i64
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %idxprom
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %1, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !5
}
===============================================
After looprotate and friends:
===============================================
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly %A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D, float* noalias nocapture readonly %E) local_unnamed_addr #0 {
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %indvars.iv
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %indvars.iv
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %1, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %indvars.iv
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 4096
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !5
}
===============================================
And after Loop Vectorize we get (notice the llvm.masked.gather):
===============================================
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly %A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D, float* noalias nocapture readonly %E) local_unnamed_addr #0 {
entry:
br i1 false, label %scalar.ph, label %min.iters.checked
min.iters.checked: ; preds = %entry
br i1 false, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %min.iters.checked
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %C, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%broadcast.splatinsert27 = insertelement <16 x float*> undef, float* %D, i32 0
%broadcast.splat28 = shufflevector <16 x float*> %broadcast.splatinsert27, <16 x float*> undef, <16 x i32> zeroinitializer
br label %vector.body
vector.body:                                      ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = extractelement <16 x i64> %vec.ind, i32 0
%1 = getelementptr inbounds float, float* %A, i64 %0
%2 = getelementptr float, float* %1, i32 0
%3 = bitcast float* %2 to <16 x float>*
%wide.load = load <16 x float>, <16 x float>* %3, align 4, !tbaa !1
%4 = fcmp ogt <16 x float> %wide.load, zeroinitializer
%5 = getelementptr inbounds float, float* %E, i64 %0
%6 = getelementptr float, float* %5, i32 0
%7 = bitcast float* %6 to <16 x float>*
%wide.load26 = load <16 x float>, <16 x float>* %7, align 4, !tbaa !1
%8 = extractelement <16 x i1> %4, i32 0
%9 = select <16 x i1> %4, <16 x float*> %broadcast.splat, <16 x float*> %broadcast.splat28
%10 = extractelement <16 x float*> %9, i32 0
%11 = getelementptr inbounds float, float* %10, i64 %0
%12 = extractelement <16 x float*> %9, i32 1
%13 = extractelement <16 x i64> %vec.ind, i32 1
%14 = getelementptr inbounds float, float* %12, i64 %13
%15 = extractelement <16 x float*> %9, i32 2
%16 = extractelement <16 x i64> %vec.ind, i32 2
%17 = getelementptr inbounds float, float* %15, i64 %16
%18 = extractelement <16 x float*> %9, i32 3
%19 = extractelement <16 x i64> %vec.ind, i32 3
%20 = getelementptr inbounds float, float* %18, i64 %19
%21 = extractelement <16 x float*> %9, i32 4
%22 = extractelement <16 x i64> %vec.ind, i32 4
%23 = getelementptr inbounds float, float* %21, i64 %22
%24 = extractelement <16 x float*> %9, i32 5
%25 = extractelement <16 x i64> %vec.ind, i32 5
%26 = getelementptr inbounds float, float* %24, i64 %25
%27 = extractelement <16 x float*> %9, i32 6
%28 = extractelement <16 x i64> %vec.ind, i32 6
%29 = getelementptr inbounds float, float* %27, i64 %28
%30 = extractelement <16 x float*> %9, i32 7
%31 = extractelement <16 x i64> %vec.ind, i32 7
%32 = getelementptr inbounds float, float* %30, i64 %31
%33 = extractelement <16 x float*> %9, i32 8
%34 = extractelement <16 x i64> %vec.ind, i32 8
%35 = getelementptr inbounds float, float* %33, i64 %34
%36 = extractelement <16 x float*> %9, i32 9
%37 = extractelement <16 x i64> %vec.ind, i32 9
%38 = getelementptr inbounds float, float* %36, i64 %37
%39 = extractelement <16 x float*> %9, i32 10
%40 = extractelement <16 x i64> %vec.ind, i32 10
%41 = getelementptr inbounds float, float* %39, i64 %40
%42 = extractelement <16 x float*> %9, i32 11
%43 = extractelement <16 x i64> %vec.ind, i32 11
%44 = getelementptr inbounds float, float* %42, i64 %43
%45 = extractelement <16 x float*> %9, i32 12
%46 = extractelement <16 x i64> %vec.ind, i32 12
%47 = getelementptr inbounds float, float* %45, i64 %46
%48 = extractelement <16 x float*> %9, i32 13
%49 = extractelement <16 x i64> %vec.ind, i32 13
%50 = getelementptr inbounds float, float* %48, i64 %49
%51 = extractelement <16 x float*> %9, i32 14
%52 = extractelement <16 x i64> %vec.ind, i32 14
%53 = getelementptr inbounds float, float* %51, i64 %52
%54 = extractelement <16 x float*> %9, i32 15
%55 = extractelement <16 x i64> %vec.ind, i32 15
%56 = getelementptr inbounds float, float* %54, i64 %55
%VectorGep = getelementptr inbounds float, <16 x float*> %9, <16 x i64> %vec.ind
%wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1
%57 = fmul <16 x float> %wide.load26, %wide.masked.gather
%58 = getelementptr inbounds float, float* %B, i64 %0
%59 = getelementptr float, float* %58, i32 0
%60 = bitcast float* %59 to <16 x float>*
store <16 x float> %57, <16 x float>* %60, align 4, !tbaa !1
%index.next = add i64 %index, 16
%vec.ind.next = add <16 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
%61 = icmp eq i64 %index.next, 4096
br i1 %61, label %middle.block, label %vector.body, !llvm.loop !5
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 4096, 4096
br i1 %cmp.n, label %for.cond.cleanup, label %scalar.ph
scalar.ph:                                        ; preds = %middle.block, %min.iters.checked, %entry
%bc.resume.val = phi i64 [ 4096, %middle.block ], [ 0, %entry ], [ 0, %min.iters.checked ]
br label %for.body
for.cond.cleanup:                                 ; preds = %middle.block, %for.body
ret void
for.body:                                         ; preds = %for.body, %scalar.ph
%indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
%62 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %62, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %indvars.iv
%63 = load float, float* %arrayidx3, align 4, !tbaa !1
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %indvars.iv
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %63, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %indvars.iv
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 4096
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}
===============================================
The final optimized IR is:
===============================================
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly %A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D, float* noalias nocapture readonly %E) local_unnamed_addr #0 {
entry:
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %C, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%broadcast.splatinsert27 = insertelement <16 x float*> undef, float* %D, i32 0
%broadcast.splat28 = shufflevector <16 x float*> %broadcast.splatinsert27, <16 x float*> undef, <16 x i32> zeroinitializer
br label %vector.body
vector.body:                                      ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %entry ], [ %vec.ind.next, %vector.body ]
%0 = extractelement <16 x i64> %vec.ind, i32 0
%1 = getelementptr inbounds float, float* %A, i64 %0
%2 = bitcast float* %1 to <16 x float>*
%wide.load = load <16 x float>, <16 x float>* %2, align 4, !tbaa !1
%3 = fcmp ogt <16 x float> %wide.load, zeroinitializer
%4 = getelementptr inbounds float, float* %E, i64 %0
%5 = bitcast float* %4 to <16 x float>*
%wide.load26 = load <16 x float>, <16 x float>* %5, align 4, !tbaa !1
%6 = select <16 x i1> %3, <16 x float*> %broadcast.splat, <16 x float*> %broadcast.splat28
%VectorGep = getelementptr inbounds float, <16 x float*> %6, <16 x i64> %vec.ind
%wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1
%7 = fmul <16 x float> %wide.load26, %wide.masked.gather
%8 = getelementptr inbounds float, float* %B, i64 %0
%9 = bitcast float* %8 to <16 x float>*
store <16 x float> %7, <16 x float>* %9, align 4, !tbaa !1
%index.next = add i64 %index, 16
%vec.ind.next = add <16 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
%10 = icmp eq i64 %index.next, 4096
br i1 %10, label %for.cond.cleanup, label %vector.body, !llvm.loop !5
for.cond.cleanup: ; preds = %vector.body
ret void
}
===============================================</pre>
</div>
</body>
</html>