[PATCH] D141590: [PassManager] Add some passes to the sequence of extra vector passes
Tiehu Zhang via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 12 03:30:12 PST 2023
TiehuZhang added a comment.
The patch may help generate better IR in some cases, e.g.:
void testNestLoop(const int nCells, const double *x, double *b,
                  const double *values, const int max_row_length) {
  int value_index = 0;
  for (int i = 0; i < nCells; i++) {
    double temp_value = 0.0;
    for (int j = value_index; j < value_index + max_row_length; j++) {
      temp_value += values[j] * x[j];
    }
    value_index += max_row_length;
    b[i] = temp_value;
  }
}
Compile options: `-O3 -march=armv8.2-a+sve -S -emit-llvm -mllvm -extra-vectorizer-passes=true`
**Without the patch**
entry:
  %cmp26 = icmp sgt i32 %nCells, 0
  br i1 %cmp26, label %for.cond1.preheader.preheader, label %for.cond.cleanup

for.cond1.preheader.preheader:                    ; preds = %entry
  %0 = sext i32 %max_row_length to i64
  %wide.trip.count = zext i32 %nCells to i64
  %cmp222 = icmp sgt i32 %max_row_length, 0
  br i1 %cmp222, label %for.cond1.preheader.preheader.split.us, label %for.cond1.preheader

for.cond1.preheader.preheader.split.us:           ; preds = %for.cond1.preheader.preheader
  %1 = tail call i64 @llvm.vscale.i64()
  %2 = shl nuw nsw i64 %1, 2
  br label %for.cond1.preheader.us

...

for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
  %indvars.iv32 = phi i64 [ %indvars.iv.next33, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
  %arrayidx9 = getelementptr inbounds double, ptr %b, i64 %indvars.iv32
  store double 0.000000e+00, ptr %arrayidx9, align 8, !tbaa !6
  %indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next33, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !15

for.cond.cleanup:                                 ; preds = %for.cond1.preheader, %for.cond.cleanup3.us, %entry
  ret void
**With the patch**
entry:
  %cmp26 = icmp sgt i32 %nCells, 0
  br i1 %cmp26, label %for.cond1.preheader.preheader, label %for.cond.cleanup

for.cond1.preheader.preheader:                    ; preds = %entry
  %0 = sext i32 %max_row_length to i64
  %wide.trip.count = zext i32 %nCells to i64
  %cmp222 = icmp sgt i32 %max_row_length, 0
  br i1 %cmp222, label %for.cond1.preheader.preheader.split.us, label %for.cond1.preheader.preheader.split

for.cond1.preheader.preheader.split.us:           ; preds = %for.cond1.preheader.preheader
  %1 = tail call i64 @llvm.vscale.i64()
  %2 = shl nuw nsw i64 %1, 2
  br label %for.cond1.preheader.us

...

for.cond1.preheader.preheader.split:              ; preds = %for.cond1.preheader.preheader
  %22 = shl nuw nsw i64 %wide.trip.count, 3
  tail call void @llvm.memset.p0.i64(ptr align 8 %b, i8 0, i64 %22, i1 false), !tbaa !6
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup3.us, %for.cond1.preheader.preheader.split, %entry
  ret void
With the patch, the simple `for.cond1.preheader` loop (the fallback version taken when `max_row_length <= 0`) is transformed into a more efficient `memset`, which may bring a performance benefit to the nested loop.
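For reference, a hedged C-level sketch of what that fallback path now computes (the helper name `zeroFillPath` is hypothetical and not part of the patch):

#include <string.h>

/* Sketch only: with max_row_length <= 0 the inner loop never runs, so the
 * outer loop just zero-fills b. The extra passes let this be recognized as
 * a single memset of %wide.trip.count * 8 bytes (see %22 above). */
static void zeroFillPath(const int nCells, double *b) {
  memset(b, 0, (size_t)nCells * sizeof(double));
}

The scalable-vector path (`for.cond1.preheader.preheader.split.us`) is identical in both outputs; only this fallback loop changes.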
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D141590/new/
https://reviews.llvm.org/D141590