[llvm-bugs] [Bug 30786] New: if-conversion+vectorization leads to masked.gather instead of two masked.load's
via llvm-bugs
llvm-bugs at lists.llvm.org
Tue Oct 25 09:43:59 PDT 2016
Bug ID: 30786
Summary: if-conversion+vectorization leads to masked.gather
instead of two masked.load's
Product: libraries
Version: trunk
Hardware: PC
OS: All
Status: NEW
Severity: normal
Priority: P
Component: Loop Optimizer
Assignee: unassignedbugs at nondot.org
Reporter: zvi.rackover at intel.com
CC: llvm-bugs at lists.llvm.org
Classification: Unclassified
Consider the following 'hello, world' for if-conversion+vectorization compiled
with clang -O3 -S -mllvm -march=skylake-avx512:
void foo(float *restrict B, const float *restrict A, const float *restrict C,
const float *restrict D, const float *restrict E) {
#pragma clang loop vectorize(enable)
#pragma clang loop unroll(disable)
for (int i = 0; i < 4096; i++) {
if (A[i] > 0) {
B[i] = (E[i] * C[i]);
} else {
B[i] = (E[i] * D[i]);
The main issue, shown below by the dumps taken after some of the interesting
transformations, is that we are generating a
%wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x
float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1
true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1
instead of two llvm.masked.load's (with complementary masks), which would be more
efficient on X86. To state the obvious: the accesses to C[i] and D[i]
are contiguous over i, so masked.load's are appropriate.
The vectorizer sees the following IR, because if-conversion has already happened earlier:
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %idxprom
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
and will vectorize the load to a gather.
Following are the dumps after some transformations.
Before instcombine:
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float*
noalias %D, float* noalias %E) local_unnamed_addr #0 {
br label %for.cond
for.cond: ; preds = %if.end, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
%cmp = icmp slt i32 %i.0, 4096
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
ret void
for.body: ; preds = %for.cond
%idxprom = sext i32 %i.0 to i64
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
br i1 %cmp1, label %if.then, label %if.else
if.then: ; preds = %for.body
%arrayidx5 = getelementptr inbounds float, float* %C, i64 %idxprom
%2 = load float, float* %arrayidx5, align 4, !tbaa !1
%mul = fmul float %1, %2
br label %if.end
if.else: ; preds = %for.body
%arrayidx11 = getelementptr inbounds float, float* %D, i64 %idxprom
%3 = load float, float* %arrayidx11, align 4, !tbaa !1
%mul12 = fmul float %1, %3
br label %if.end
if.end: ; preds = %if.else, %if.then
%mul12.sink = phi float [ %mul12, %if.else ], [ %mul, %if.then ]
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !5
After instcombine:
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float*
noalias %D, float* noalias %E) local_unnamed_addr #0 {
br label %for.cond
for.cond: ; preds = %if.end, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
%cmp = icmp slt i32 %i.0, 4096
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
ret void
for.body: ; preds = %for.cond
%idxprom = sext i32 %i.0 to i64
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
br i1 %cmp1, label %if.then, label %if.else
if.then: ; preds = %for.body
br label %if.end
if.else: ; preds = %for.body
br label %if.end
if.end: ; preds = %if.else, %if.then
%D.pn = phi float* [ %D, %if.else ], [ %C, %if.then ]
%.pn.in = getelementptr inbounds float, float* %D.pn, i64 %idxprom
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %1, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !5
After simplifycfg:
define void @A(float* noalias %B, float* noalias %A, float* noalias %C, float*
noalias %D, float* noalias %E) local_unnamed_addr #0 {
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp slt i32 %i.0, 4096
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
ret void
for.body: ; preds = %for.cond
%idxprom = sext i32 %i.0 to i64
%arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %idxprom
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %idxprom
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %1, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %idxprom
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !5
After looprotate and friends:
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly
%A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D,
float* noalias nocapture readonly %E) local_unnamed_addr #0 {
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %0, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %indvars.iv
%1 = load float, float* %arrayidx3, align 4, !tbaa !1
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %indvars.iv
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %1, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %indvars.iv
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 4096
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !5
And after Loop Vectorize we get (notice the llvm.masked.gather):
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly
%A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D,
float* noalias nocapture readonly %E) local_unnamed_addr #0 {
br i1 false, label %scalar.ph, label %min.iters.checked
min.iters.checked: ; preds = %entry
br i1 false, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %min.iters.checked
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %C, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x
float*> undef, <16 x i32> zeroinitializer
%broadcast.splatinsert27 = insertelement <16 x float*> undef, float* %D, i32 0
%broadcast.splat28 = shufflevector <16 x float*> %broadcast.splatinsert27,
<16 x float*> undef, <16 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6,
i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>,
%vector.ph ], [ %vec.ind.next, %vector.body ]
%0 = extractelement <16 x i64> %vec.ind, i32 0
%1 = getelementptr inbounds float, float* %A, i64 %0
%2 = getelementptr float, float* %1, i32 0
%3 = bitcast float* %2 to <16 x float>*
%wide.load = load <16 x float>, <16 x float>* %3, align 4, !tbaa !1
%4 = fcmp ogt <16 x float> %wide.load, zeroinitializer
%5 = getelementptr inbounds float, float* %E, i64 %0
%6 = getelementptr float, float* %5, i32 0
%7 = bitcast float* %6 to <16 x float>*
%wide.load26 = load <16 x float>, <16 x float>* %7, align 4, !tbaa !1
%8 = extractelement <16 x i1> %4, i32 0
%9 = select <16 x i1> %4, <16 x float*> %broadcast.splat, <16 x float*> %broadcast.splat28
%10 = extractelement <16 x float*> %9, i32 0
%11 = getelementptr inbounds float, float* %10, i64 %0
%12 = extractelement <16 x float*> %9, i32 1
%13 = extractelement <16 x i64> %vec.ind, i32 1
%14 = getelementptr inbounds float, float* %12, i64 %13
%15 = extractelement <16 x float*> %9, i32 2
%16 = extractelement <16 x i64> %vec.ind, i32 2
%17 = getelementptr inbounds float, float* %15, i64 %16
%18 = extractelement <16 x float*> %9, i32 3
%19 = extractelement <16 x i64> %vec.ind, i32 3
%20 = getelementptr inbounds float, float* %18, i64 %19
%21 = extractelement <16 x float*> %9, i32 4
%22 = extractelement <16 x i64> %vec.ind, i32 4
%23 = getelementptr inbounds float, float* %21, i64 %22
%24 = extractelement <16 x float*> %9, i32 5
%25 = extractelement <16 x i64> %vec.ind, i32 5
%26 = getelementptr inbounds float, float* %24, i64 %25
%27 = extractelement <16 x float*> %9, i32 6
%28 = extractelement <16 x i64> %vec.ind, i32 6
%29 = getelementptr inbounds float, float* %27, i64 %28
%30 = extractelement <16 x float*> %9, i32 7
%31 = extractelement <16 x i64> %vec.ind, i32 7
%32 = getelementptr inbounds float, float* %30, i64 %31
%33 = extractelement <16 x float*> %9, i32 8
%34 = extractelement <16 x i64> %vec.ind, i32 8
%35 = getelementptr inbounds float, float* %33, i64 %34
%36 = extractelement <16 x float*> %9, i32 9
%37 = extractelement <16 x i64> %vec.ind, i32 9
%38 = getelementptr inbounds float, float* %36, i64 %37
%39 = extractelement <16 x float*> %9, i32 10
%40 = extractelement <16 x i64> %vec.ind, i32 10
%41 = getelementptr inbounds float, float* %39, i64 %40
%42 = extractelement <16 x float*> %9, i32 11
%43 = extractelement <16 x i64> %vec.ind, i32 11
%44 = getelementptr inbounds float, float* %42, i64 %43
%45 = extractelement <16 x float*> %9, i32 12
%46 = extractelement <16 x i64> %vec.ind, i32 12
%47 = getelementptr inbounds float, float* %45, i64 %46
%48 = extractelement <16 x float*> %9, i32 13
%49 = extractelement <16 x i64> %vec.ind, i32 13
%50 = getelementptr inbounds float, float* %48, i64 %49
%51 = extractelement <16 x float*> %9, i32 14
%52 = extractelement <16 x i64> %vec.ind, i32 14
%53 = getelementptr inbounds float, float* %51, i64 %52
%54 = extractelement <16 x float*> %9, i32 15
%55 = extractelement <16 x i64> %vec.ind, i32 15
%56 = getelementptr inbounds float, float* %54, i64 %55
%VectorGep = getelementptr inbounds float, <16 x float*> %9, <16 x i64> %vec.ind
%wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x
float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1
true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1
%57 = fmul <16 x float> %wide.load26, %wide.masked.gather
%58 = getelementptr inbounds float, float* %B, i64 %0
%59 = getelementptr float, float* %58, i32 0
%60 = bitcast float* %59 to <16 x float>*
store <16 x float> %57, <16 x float>* %60, align 4, !tbaa !1
%index.next = add i64 %index, 16
%vec.ind.next = add <16 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64
16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64
16, i64 16>
%61 = icmp eq i64 %index.next, 4096
br i1 %61, label %middle.block, label %vector.body, !llvm.loop !5
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 4096, 4096
br i1 %cmp.n, label %for.cond.cleanup, label %scalar.ph
scalar.ph: ; preds = %middle.block,
%min.iters.checked, %entry
%bc.resume.val = phi i64 [ 4096, %middle.block ], [ 0, %entry ], [ 0,
%min.iters.checked ]
br label %for.body
for.cond.cleanup: ; preds = %middle.block, %for.body
ret void
for.body: ; preds = %for.body, %scalar.ph
%indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next,
%for.body ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
%62 = load float, float* %arrayidx, align 4, !tbaa !1
%cmp1 = fcmp ogt float %62, 0.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %E, i64 %indvars.iv
%63 = load float, float* %arrayidx3, align 4, !tbaa !1
%C.D = select i1 %cmp1, float* %C, float* %D
%.pn.in = getelementptr inbounds float, float* %C.D, i64 %indvars.iv
%.pn = load float, float* %.pn.in, align 4, !tbaa !1
%mul12.sink = fmul float %63, %.pn
%arrayidx14 = getelementptr inbounds float, float* %B, i64 %indvars.iv
store float %mul12.sink, float* %arrayidx14, align 4, !tbaa !1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 4096
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
The final optimized IR is:
define void @A(float* noalias nocapture %B, float* noalias nocapture readonly
%A, float* noalias nocapture readonly %C, float* noalias nocapture readonly %D,
float* noalias nocapture readonly %E) local_unnamed_addr #0 {
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %C, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x
float*> undef, <16 x i32> zeroinitializer
%broadcast.splatinsert27 = insertelement <16 x float*> undef, float* %D, i32 0
%broadcast.splat28 = shufflevector <16 x float*> %broadcast.splatinsert27,
<16 x float*> undef, <16 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6,
i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %entry ],
[ %vec.ind.next, %vector.body ]
%0 = extractelement <16 x i64> %vec.ind, i32 0
%1 = getelementptr inbounds float, float* %A, i64 %0
%2 = bitcast float* %1 to <16 x float>*
%wide.load = load <16 x float>, <16 x float>* %2, align 4, !tbaa !1
%3 = fcmp ogt <16 x float> %wide.load, zeroinitializer
%4 = getelementptr inbounds float, float* %E, i64 %0
%5 = bitcast float* %4 to <16 x float>*
%wide.load26 = load <16 x float>, <16 x float>* %5, align 4, !tbaa !1
%6 = select <16 x i1> %3, <16 x float*> %broadcast.splat, <16 x float*> %broadcast.splat28
%VectorGep = getelementptr inbounds float, <16 x float*> %6, <16 x i64> %vec.ind
%wide.masked.gather = call <16 x float> @llvm.masked.gather.v16f32(<16 x
float*> %VectorGep, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1
true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true>, <16 x float> undef), !tbaa !1
%7 = fmul <16 x float> %wide.load26, %wide.masked.gather
%8 = getelementptr inbounds float, float* %B, i64 %0
%9 = bitcast float* %8 to <16 x float>*
store <16 x float> %7, <16 x float>* %9, align 4, !tbaa !1
%index.next = add i64 %index, 16
%vec.ind.next = add <16 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64
16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64
16, i64 16>
%10 = icmp eq i64 %index.next, 4096
br i1 %10, label %for.cond.cleanup, label %vector.body, !llvm.loop !5
for.cond.cleanup: ; preds = %vector.body
ret void
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20161025/2eefbb2b/attachment-0001.html>
More information about the llvm-bugs
mailing list