[PATCH] D158988: [LV] Choose the wider VF where they have same cost

Mon Aug 28 07:17:55 PDT 2023

Allen created this revision.
Allen added reviewers: sdesmalen, dmgreen, bmahjour, david-arm, ctetreau.
Herald added subscribers: artagnon, hiraditya.
Herald added a project: All.
Allen requested review of this revision.
Herald added subscribers: llvm-commits, wangpc.
Herald added a project: LLVM.

Sometimes different VFs have the same cost; in that case, prefer
the wider VF to improve the degree of parallelism.

      

Fixes https://github.com/llvm/llvm-project/issues/64986


https://reviews.llvm.org/D158988

Files:
  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
  llvm/test/Transforms/LoopVectorize/AArch64/pr64986.ll


Index: llvm/test/Transforms/LoopVectorize/AArch64/pr64986.ll
===================================================================

--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/pr64986.ll
@@ -0,0 +1,57 @@
+; RUN: opt -mtriple=aarch64-unknown-linux-gnu -mattr=+sve -passes=loop-vectorize -pass-remarks=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Prefer vscale x 16 when vscale x 16 and vscale x 8 have the same cost.
+; CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: vscale x 16, interleaved count: 2)
+
+define void @pixel_avg(ptr noalias nocapture %dst, i32 %i_dst_stride, ptr noalias nocapture %src1, i32 %i_src1_stride, ptr noalias nocapture %src2, i32 %i_src2_stride, i32 %i_width, i32 %i_height) {
+entry:
+  %cmp29 = icmp sgt i32 %i_height, 0
+  br i1 %cmp29, label %for.preheader.lr.ph, label %cleanup
+
+for.preheader.lr.ph:                        ; preds = %entry
+  %cmp227 = icmp sgt i32 %i_width, 0
+  %idx.ext = sext i32 %i_dst_stride to i64
+  %idx.ext12 = sext i32 %i_src1_stride to i64
+  %idx.ext14 = sext i32 %i_src2_stride to i64
+  %wide.trip.count = zext i32 %i_width to i64
+  br i1 %cmp227, label %for.preheader, label %cleanup
+
+for.preheader:                           ; preds = %for.preheader.lr.ph, %for.latch
+  %y.033.us = phi i32 [ %inc17.us, %for.latch ], [ 0, %for.preheader.lr.ph ]
+  %dst.addr.032.us = phi ptr [ %add.ptr.us, %for.latch ], [ %dst, %for.preheader.lr.ph ]
+  %src1.addr.031.us = phi ptr [ %add.ptr13.us, %for.latch ], [ %src1, %for.preheader.lr.ph ]
+  %src2.addr.030.us = phi ptr [ %add.ptr15.us, %for.latch ], [ %src2, %for.preheader.lr.ph ]
+  br label %for.body
+
+for.body:                                     ; preds = %for.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %conv.us = zext i8 %0 to i16
+  %arrayidx6.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx6.us, align 1
+  %conv7.us = zext i8 %1 to i16
+  %add.us = add nuw nsw i16 %conv.us, 1
+  %add8.us = add nuw nsw i16 %add.us, %conv7.us
+  %shr.us = lshr i16 %add8.us, 1
+  %conv9.us = trunc i16 %shr.us to i8
+  %arrayidx11.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %indvars.iv
+  store i8 %conv9.us, ptr %arrayidx11.us, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.latch, label %for.body
+
+for.latch:         ; preds = %for.body
+  %add.ptr.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %idx.ext
+  %add.ptr13.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %idx.ext12
+  %add.ptr15.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %idx.ext14
+  %inc17.us = add nuw nsw i32 %y.033.us, 1
+  %exitcond36.not = icmp eq i32 %inc17.us, %i_height
+  br i1 %exitcond36.not, label %cleanup, label %for.preheader
+
+cleanup:                                 ; preds = %for.latch, %for.preheader.lr.ph, %entry
+  ret void
+}
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5220,9 +5220,10 @@
     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
 
   // To avoid the need for FP division:
-  //      (CostA / A.Width) < (CostB / B.Width)
-  // <=>  (CostA * B.Width) < (CostB * A.Width)
-  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
+  //      (CostA / A.Width) <= (CostB / B.Width)
+  // <=>  (CostA * B.Width) <= (CostB * A.Width)
+  // Choose the wider VF when both have the same cost.
+  return (CostA * EstimatedWidthB) <= (CostB * EstimatedWidthA);
 }
 
 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D158988.553923.patch
Type: text/x-patch
Size: 4141 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20230828/6581cf7f/attachment.bin>