[PATCH] D141590: [PassManager] Add some passes to the sequence of extra vector passes
Tiehu Zhang via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 12 04:14:54 PST 2023
TiehuZhang added a comment.
In D141590#4046933 <https://reviews.llvm.org/D141590#4046933>, @fhahn wrote:
> Could you share a full IR test showing the improvements? The main goal of the extra vectorizer passes is to optimize the vector code, not necessarily other scalar loops.
Hi @fhahn, this case is a nested loop. The inner loop has been SVE-vectorized. But there is a special scenario: when the trip count of the inner loop, `max_row_length`, is 0, the outer loop degenerates into a memset operation, as shown in `for.cond1.preheader`. Without this patch, the outer loop executes as a scalar loop (`for.cond1.preheader`). With the patch, this loop is optimized to `@llvm.memset.p0.i64` (shown in `for.cond1.preheader.preheader.split:`), which is more efficient.
; Module-level target description for this test: the data layout string and
; the target triple (AArch64, Linux, GNU environment).
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
; Function Attrs: nofree nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16)
;
; testNestLoop(nCells, x, b, values, max_row_length)
;
; Outer loop over i in [0, nCells). For each i, the inner loop accumulates
; values[k] * x[k] over a run of consecutive k and stores the sum to b[i].
; The starting k advances by max_row_length on every outer iteration.
;
; The function is split on whether max_row_length > 0:
;   * ".us" blocks : max_row_length > 0; the inner loop has a scalable-vector
;                    body plus a scalar remainder loop.
;   * non-".us"    : max_row_length <= 0; only "b[i] = 0.0" remains.
;
; NOTE: the intrinsic declarations, attribute group #0 and the metadata nodes
; (!6, !10, !14, !15) referenced below are not part of this excerpt.
define dso_local void @testNestLoop(i32 noundef %nCells, ptr nocapture noundef readonly %x, ptr nocapture noundef writeonly %b, ptr nocapture noundef readonly %values, i32 noundef %max_row_length) local_unnamed_addr #0 {
entry:
; Nothing to do unless nCells > 0 (signed).
%cmp26 = icmp sgt i32 %nCells, 0
br i1 %cmp26, label %for.cond1.preheader.preheader, label %for.cond.cleanup
for.cond1.preheader.preheader: ; preds = %entry
; %0 = max_row_length widened to i64 (signed); %wide.trip.count = nCells
; widened to i64 (known positive on this path).
%0 = sext i32 %max_row_length to i64
%wide.trip.count = zext i32 %nCells to i64
; Choose the version of the outer loop: with inner-loop work (.us) or the
; store-zero-only loop.
%cmp222 = icmp sgt i32 %max_row_length, 0
br i1 %cmp222, label %for.cond1.preheader.preheader.split.us, label %for.cond1.preheader
for.cond1.preheader.preheader.split.us: ; preds = %for.cond1.preheader.preheader
; %2 = vscale * 4: elements consumed per vector-loop iteration (two
; <vscale x 2 x double> loads per input array, see vector.body.us).
%1 = tail call i64 @llvm.vscale.i64()
%2 = shl nuw nsw i64 %1, 2
br label %for.cond1.preheader.us
for.cond1.preheader.us: ; preds = %for.cond.cleanup3.us, %for.cond1.preheader.preheader.split.us
; Outer loop header (max_row_length > 0 path).
;   %indvars.iv32.us : outer index i
;   %indvars.iv.us   : first inner index k for this i (0, then previous %3)
%indvars.iv32.us = phi i64 [ 0, %for.cond1.preheader.preheader.split.us ], [ %indvars.iv.next33.us, %for.cond.cleanup3.us ]
%indvars.iv.us = phi i64 [ 0, %for.cond1.preheader.preheader.split.us ], [ %3, %for.cond.cleanup3.us ]
; %3 = exclusive end of the inner index range (start + max_row_length); it is
; also the start value for the next outer iteration.
%3 = add i64 %indvars.iv.us, %0
%4 = add i64 %indvars.iv.us, 1
; Inner trip count: max(end, start + 1) - i * max_row_length. %5 matches
; %indvars.iv.us, which starts at 0 and grows by %0 per outer iteration.
%smax.us = tail call i64 @llvm.smax.i64(i64 %3, i64 %4)
%5 = mul i64 %indvars.iv32.us, %0
%6 = sub i64 %smax.us, %5
; Fewer iterations than one vector step: go straight to the scalar loop.
%min.iters.check.us = icmp ult i64 %6, %2
br i1 %min.iters.check.us, label %for.body4.us.preheader, label %vector.ph.us
vector.ph.us: ; preds = %for.cond1.preheader.us
; %n.vec.us = trip count rounded down to a multiple of the vector step;
; %ind.end.us = inner index at which the scalar remainder resumes.
%n.mod.vf.us = urem i64 %6, %2
%n.vec.us = sub i64 %6, %n.mod.vf.us
%ind.end.us = add i64 %indvars.iv.us, %n.vec.us
; %9 = vscale * 2: element offset of the second vector load of each pair.
%7 = tail call i32 @llvm.vscale.i32()
%8 = shl nuw nsw i32 %7, 1
%9 = zext i32 %8 to i64
br label %vector.body.us
vector.body.us: ; preds = %vector.body.us, %vector.ph.us
; Vector loop: %index.us counts elements already processed for this row,
; %vec.phi.us is the running scalar sum.
%index.us = phi i64 [ 0, %vector.ph.us ], [ %index.next.us, %vector.body.us ]
%vec.phi.us = phi double [ 0.000000e+00, %vector.ph.us ], [ %17, %vector.body.us ]
%offset.idx.us = add i64 %indvars.iv.us, %index.us
; Two adjacent vectors from values[] ...
%10 = getelementptr inbounds double, ptr %values, i64 %offset.idx.us
%wide.load.us = load <vscale x 2 x double>, ptr %10, align 8, !tbaa !6
%11 = getelementptr inbounds double, ptr %10, i64 %9
%wide.load37.us = load <vscale x 2 x double>, ptr %11, align 8, !tbaa !6
; ... and the matching two vectors from x[].
%12 = getelementptr inbounds double, ptr %x, i64 %offset.idx.us
%wide.load38.us = load <vscale x 2 x double>, ptr %12, align 8, !tbaa !6
%13 = getelementptr inbounds double, ptr %12, i64 %9
%wide.load39.us = load <vscale x 2 x double>, ptr %13, align 8, !tbaa !6
; Element-wise products, then fold both product vectors into the scalar sum;
; the running sum is passed as the start value of each reduction.
%14 = fmul <vscale x 2 x double> %wide.load.us, %wide.load38.us
%15 = fmul <vscale x 2 x double> %wide.load37.us, %wide.load39.us
%16 = tail call double @llvm.vector.reduce.fadd.nxv2f64(double %vec.phi.us, <vscale x 2 x double> %14)
%17 = tail call double @llvm.vector.reduce.fadd.nxv2f64(double %16, <vscale x 2 x double> %15)
%index.next.us = add nuw i64 %index.us, %2
%18 = icmp eq i64 %index.next.us, %n.vec.us
br i1 %18, label %middle.block.us, label %vector.body.us, !llvm.loop !10
middle.block.us: ; preds = %vector.body.us
; No leftover iterations: skip the scalar remainder loop.
%cmp.n.us = icmp eq i64 %n.mod.vf.us, 0
br i1 %cmp.n.us, label %for.cond.cleanup3.us, label %for.body4.us.preheader
for.body4.us.preheader: ; preds = %middle.block.us, %for.cond1.preheader.us
; Start index and partial sum for the scalar loop: fresh (row start, 0.0)
; when the vector loop was skipped, otherwise where the vector loop stopped.
%indvars.iv29.us.ph = phi i64 [ %indvars.iv.us, %for.cond1.preheader.us ], [ %ind.end.us, %middle.block.us ]
%temp_value.023.us.ph = phi double [ 0.000000e+00, %for.cond1.preheader.us ], [ %17, %middle.block.us ]
br label %for.body4.us
for.body4.us: ; preds = %for.body4.us.preheader, %for.body4.us
; Scalar inner loop: sum = fmuladd(values[k], x[k], sum), while k + 1 < %3.
%indvars.iv29.us = phi i64 [ %indvars.iv.next30.us, %for.body4.us ], [ %indvars.iv29.us.ph, %for.body4.us.preheader ]
%temp_value.023.us = phi double [ %21, %for.body4.us ], [ %temp_value.023.us.ph, %for.body4.us.preheader ]
%arrayidx.us = getelementptr inbounds double, ptr %values, i64 %indvars.iv29.us
%19 = load double, ptr %arrayidx.us, align 8, !tbaa !6
%arrayidx6.us = getelementptr inbounds double, ptr %x, i64 %indvars.iv29.us
%20 = load double, ptr %arrayidx6.us, align 8, !tbaa !6
%21 = tail call double @llvm.fmuladd.f64(double %19, double %20, double %temp_value.023.us)
%indvars.iv.next30.us = add nsw i64 %indvars.iv29.us, 1
%cmp2.us = icmp slt i64 %indvars.iv.next30.us, %3
br i1 %cmp2.us, label %for.body4.us, label %for.cond.cleanup3.us, !llvm.loop !14
for.cond.cleanup3.us: ; preds = %for.body4.us, %middle.block.us
; Inner loop finished: store the sum to b[i] and advance the outer loop.
%temp_value.0.lcssa.us = phi double [ %17, %middle.block.us ], [ %21, %for.body4.us ]
%arrayidx9.us = getelementptr inbounds double, ptr %b, i64 %indvars.iv32.us
store double %temp_value.0.lcssa.us, ptr %arrayidx9.us, align 8, !tbaa !6
%indvars.iv.next33.us = add nuw nsw i64 %indvars.iv32.us, 1
%exitcond.not.us = icmp eq i64 %indvars.iv.next33.us, %wide.trip.count
br i1 %exitcond.not.us, label %for.cond.cleanup, label %for.cond1.preheader.us, !llvm.loop !15
for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
; Outer loop for max_row_length <= 0: no inner work is left, so each iteration
; only stores 0.0 to b[i] for i in [0, nCells). This is the scalar loop that
; the accompanying review comment describes as a memset candidate.
%indvars.iv32 = phi i64 [ %indvars.iv.next33, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
%arrayidx9 = getelementptr inbounds double, ptr %b, i64 %indvars.iv32
store double 0.000000e+00, ptr %arrayidx9, align 8, !tbaa !6
%indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1
%exitcond.not = icmp eq i64 %indvars.iv.next33, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !15
for.cond.cleanup: ; preds = %for.cond1.preheader, %for.cond.cleanup3.us, %entry
; Common exit for all three paths.
ret void
}
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D141590/new/
https://reviews.llvm.org/D141590
More information about the llvm-commits
mailing list