[PATCH] D141590: [PassManager] Add some passes to the sequence of extra vector passes

Thu Jan 12 04:14:54 PST 2023

TiehuZhang added a comment.

In D141590#4046933 <https://reviews.llvm.org/D141590#4046933>, @fhahn wrote:

> Could you share a full IR test showing the improvements? The main goal of the extra vectorizer passes is to optimize the vector code, not necessarily other scalar loops.

Hi @fhahn, this case is a nested loop. The inner loop has been vectorized with SVE. But there is a special scenario: when the trip count of the inner loop, `max_row_length`, is 0, the outer loop degenerates into a memset operation, as shown in `for.cond1.preheader`. Without this patch, the outer loop is executed as a scalar loop (`for.cond1.preheader`). With the patch, this loop is optimized to `@llvm.memset.p0.i64` (shown in `for.cond1.preheader.preheader.split:`), which is more efficient.

  ; Module-level target description for this test: the data layout string and
  ; an AArch64 Linux target triple.
  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
  target triple = "aarch64-unknown-linux-gnu"

  ; testNestLoop: nested-loop reproducer. For each outer index i in
  ; [0, %nCells) one double is stored to %b[i].
  ;   * If %max_row_length > 0 (the ".us" blocks), the stored value is the sum
  ;     of %values[k] * %x[k] over the row [row_start, row_start + %max_row_length),
  ;     where row_start advances by %max_row_length per outer iteration. The
  ;     inner loop has a scalable-vector body plus a scalar remainder loop.
  ;   * Otherwise (block for.cond1.preheader), 0.0 is stored to every %b[i].
  ; NOTE: attribute group #0, the intrinsic declarations and the metadata nodes
  ; (!6, !10, !14, !15) referenced below are not shown in this excerpt.
  ; Function Attrs: nofree nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16)
  define dso_local void @testNestLoop(i32 noundef %nCells, ptr nocapture noundef readonly %x, ptr nocapture noundef writeonly %b, ptr nocapture noundef readonly %values, i32 noundef %max_row_length) local_unnamed_addr #0 {
  entry:
    ; Nothing to do when %nCells <= 0.
    %cmp26 = icmp sgt i32 %nCells, 0
    br i1 %cmp26, label %for.cond1.preheader.preheader, label %for.cond.cleanup

  for.cond1.preheader.preheader:                    ; preds = %entry
    ; %0 = row length widened to i64, %wide.trip.count = outer trip count.
    ; The outer loop exists in two versions, selected once here on whether
    ; the inner loop has any work (%max_row_length > 0).
    %0 = sext i32 %max_row_length to i64
    %wide.trip.count = zext i32 %nCells to i64
    %cmp222 = icmp sgt i32 %max_row_length, 0
    br i1 %cmp222, label %for.cond1.preheader.preheader.split.us, label %for.cond1.preheader

  for.cond1.preheader.preheader.split.us:           ; preds = %for.cond1.preheader.preheader
    ; %2 = vscale * 4: number of doubles consumed per vector-loop iteration
    ; (two <vscale x 2 x double> loads per input array).
    %1 = tail call i64 @llvm.vscale.i64()
    %2 = shl nuw nsw i64 %1, 2
    br label %for.cond1.preheader.us

  for.cond1.preheader.us:                           ; preds = %for.cond.cleanup3.us, %for.cond1.preheader.preheader.split.us
    ; Outer loop header (non-empty inner loop version).
    ;   %indvars.iv32.us = outer index i (indexes %b)
    ;   %indvars.iv.us   = first element index of the current row
    ;   %3               = exclusive end index of the current row (start + %0)
    ;   %6               = inner trip count: smax(%3, start + 1) - i * %0
    ; Rows with fewer than %2 elements go straight to the scalar loop.
    %indvars.iv32.us = phi i64 [ 0, %for.cond1.preheader.preheader.split.us ], [ %indvars.iv.next33.us, %for.cond.cleanup3.us ]
    %indvars.iv.us = phi i64 [ 0, %for.cond1.preheader.preheader.split.us ], [ %3, %for.cond.cleanup3.us ]
    %3 = add i64 %indvars.iv.us, %0
    %4 = add i64 %indvars.iv.us, 1
    %smax.us = tail call i64 @llvm.smax.i64(i64 %3, i64 %4)
    %5 = mul i64 %indvars.iv32.us, %0
    %6 = sub i64 %smax.us, %5
    %min.iters.check.us = icmp ult i64 %6, %2
    br i1 %min.iters.check.us, label %for.body4.us.preheader, label %vector.ph.us

  vector.ph.us:                                     ; preds = %for.cond1.preheader.us
    ; Vector loop preheader.
    ;   %n.mod.vf.us = elements left over for the scalar remainder loop
    ;   %n.vec.us    = elements handled by the vector loop (multiple of %2)
    ;   %ind.end.us  = element index at which the scalar remainder resumes
    ;   %9           = vscale * 2: element offset of the second vector load
    %n.mod.vf.us = urem i64 %6, %2
    %n.vec.us = sub i64 %6, %n.mod.vf.us
    %ind.end.us = add i64 %indvars.iv.us, %n.vec.us
    %7 = tail call i32 @llvm.vscale.i32()
    %8 = shl nuw nsw i32 %7, 1
    %9 = zext i32 %8 to i64
    br label %vector.body.us

  vector.body.us:                                   ; preds = %vector.body.us, %vector.ph.us
    ; Vector loop body: loads two scalable vectors from %values and two from
    ; %x, multiplies them pairwise, and folds both products into the scalar
    ; accumulator %vec.phi.us through two chained reduce.fadd calls.
    ; %index.us advances by %2 until it reaches %n.vec.us.
    %index.us = phi i64 [ 0, %vector.ph.us ], [ %index.next.us, %vector.body.us ]
    %vec.phi.us = phi double [ 0.000000e+00, %vector.ph.us ], [ %17, %vector.body.us ]
    %offset.idx.us = add i64 %indvars.iv.us, %index.us
    %10 = getelementptr inbounds double, ptr %values, i64 %offset.idx.us
    %wide.load.us = load <vscale x 2 x double>, ptr %10, align 8, !tbaa !6
    %11 = getelementptr inbounds double, ptr %10, i64 %9
    %wide.load37.us = load <vscale x 2 x double>, ptr %11, align 8, !tbaa !6
    %12 = getelementptr inbounds double, ptr %x, i64 %offset.idx.us
    %wide.load38.us = load <vscale x 2 x double>, ptr %12, align 8, !tbaa !6
    %13 = getelementptr inbounds double, ptr %12, i64 %9
    %wide.load39.us = load <vscale x 2 x double>, ptr %13, align 8, !tbaa !6
    %14 = fmul <vscale x 2 x double> %wide.load.us, %wide.load38.us
    %15 = fmul <vscale x 2 x double> %wide.load37.us, %wide.load39.us
    %16 = tail call double @llvm.vector.reduce.fadd.nxv2f64(double %vec.phi.us, <vscale x 2 x double> %14)
    %17 = tail call double @llvm.vector.reduce.fadd.nxv2f64(double %16, <vscale x 2 x double> %15)
    %index.next.us = add nuw i64 %index.us, %2
    %18 = icmp eq i64 %index.next.us, %n.vec.us
    br i1 %18, label %middle.block.us, label %vector.body.us, !llvm.loop !10

  middle.block.us:                                  ; preds = %vector.body.us
    ; If the vector loop covered the whole row (no remainder), skip the
    ; scalar loop and go directly to the store.
    %cmp.n.us = icmp eq i64 %n.mod.vf.us, 0
    br i1 %cmp.n.us, label %for.cond.cleanup3.us, label %for.body4.us.preheader

  for.body4.us.preheader:                           ; preds = %middle.block.us, %for.cond1.preheader.us
    ; Scalar loop preheader: merges the start index and the partial sum from
    ; either the bypass path (row start, 0.0) or the vector loop
    ; (%ind.end.us, %17).
    %indvars.iv29.us.ph = phi i64 [ %indvars.iv.us, %for.cond1.preheader.us ], [ %ind.end.us, %middle.block.us ]
    %temp_value.023.us.ph = phi double [ 0.000000e+00, %for.cond1.preheader.us ], [ %17, %middle.block.us ]
    br label %for.body4.us

  for.body4.us:                                     ; preds = %for.body4.us.preheader, %for.body4.us
    ; Scalar inner loop: acc = fmuladd(%values[k], %x[k], acc) for each
    ; remaining k, running until k + 1 is no longer less than the row end %3.
    %indvars.iv29.us = phi i64 [ %indvars.iv.next30.us, %for.body4.us ], [ %indvars.iv29.us.ph, %for.body4.us.preheader ]
    %temp_value.023.us = phi double [ %21, %for.body4.us ], [ %temp_value.023.us.ph, %for.body4.us.preheader ]
    %arrayidx.us = getelementptr inbounds double, ptr %values, i64 %indvars.iv29.us
    %19 = load double, ptr %arrayidx.us, align 8, !tbaa !6
    %arrayidx6.us = getelementptr inbounds double, ptr %x, i64 %indvars.iv29.us
    %20 = load double, ptr %arrayidx6.us, align 8, !tbaa !6
    %21 = tail call double @llvm.fmuladd.f64(double %19, double %20, double %temp_value.023.us)
    %indvars.iv.next30.us = add nsw i64 %indvars.iv29.us, 1
    %cmp2.us = icmp slt i64 %indvars.iv.next30.us, %3
    br i1 %cmp2.us, label %for.body4.us, label %for.cond.cleanup3.us, !llvm.loop !14

  for.cond.cleanup3.us:                             ; preds = %for.body4.us, %middle.block.us
    ; Outer loop latch: store the row's accumulated value to %b[i], then
    ; advance i and exit once it reaches %wide.trip.count.
    %temp_value.0.lcssa.us = phi double [ %17, %middle.block.us ], [ %21, %for.body4.us ]
    %arrayidx9.us = getelementptr inbounds double, ptr %b, i64 %indvars.iv32.us
    store double %temp_value.0.lcssa.us, ptr %arrayidx9.us, align 8, !tbaa !6
    %indvars.iv.next33.us = add nuw nsw i64 %indvars.iv32.us, 1
    %exitcond.not.us = icmp eq i64 %indvars.iv.next33.us, %wide.trip.count
    br i1 %exitcond.not.us, label %for.cond.cleanup, label %for.cond1.preheader.us, !llvm.loop !15

  for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
    ; Outer loop version taken when %max_row_length <= 0: the inner loop is
    ; gone, so this is a scalar loop storing 0.0 to %b[i] for every i in
    ; [0, %wide.trip.count). This is the loop the accompanying review
    ; comment describes as degenerating into a memset.
    %indvars.iv32 = phi i64 [ %indvars.iv.next33, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
    %arrayidx9 = getelementptr inbounds double, ptr %b, i64 %indvars.iv32
    store double 0.000000e+00, ptr %arrayidx9, align 8, !tbaa !6
    %indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1
    %exitcond.not = icmp eq i64 %indvars.iv.next33, %wide.trip.count
    br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !15

  for.cond.cleanup:                                 ; preds = %for.cond1.preheader, %for.cond.cleanup3.us, %entry
    ; Common exit for both outer-loop versions and the %nCells <= 0 case.
    ret void
  }

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141590/new/

https://reviews.llvm.org/D141590