[PATCH] D158988: [LV] Choose the wider VF where they have same cost
Allen zhong via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 28 07:17:55 PDT 2023
Allen created this revision.
Allen added reviewers: sdesmalen, dmgreen, bmahjour, david-arm, ctetreau.
Herald added subscribers: artagnon, hiraditya.
Herald added a project: All.
Allen requested review of this revision.
Herald added subscribers: llvm-commits, wangpc.
Herald added a project: LLVM.
Sometimes different VFs have the same cost; in that case, prefer
the wider VF to improve the degree of parallelism.
Fixes https://github.com/llvm/llvm-project/issues/64986
https://reviews.llvm.org/D158988
Files:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/pr64986.ll
Index: llvm/test/Transforms/LoopVectorize/AArch64/pr64986.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/pr64986.ll
@@ -0,0 +1,57 @@
+; RUN: opt -mtriple=aarch64-unknown-linux-gnu -mattr=+sve -passes=loop-vectorize -pass-remarks=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Prefer vscale x 16 when vscale x 16 and vscale x 8 have the same cost.
+; CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: vscale x 16, interleaved count: 2)
+
+define void @pixel_avg(ptr noalias nocapture %dst, i32 %i_dst_stride, ptr noalias nocapture %src1, i32 %i_src1_stride, ptr noalias nocapture %src2, i32 %i_src2_stride, i32 %i_width, i32 %i_height) {
+entry:
+ %cmp29 = icmp sgt i32 %i_height, 0
+ br i1 %cmp29, label %for.preheader.lr.ph, label %cleanup
+
+for.preheader.lr.ph: ; preds = %entry
+ %cmp227 = icmp sgt i32 %i_width, 0
+ %idx.ext = sext i32 %i_dst_stride to i64
+ %idx.ext12 = sext i32 %i_src1_stride to i64
+ %idx.ext14 = sext i32 %i_src2_stride to i64
+ %wide.trip.count = zext i32 %i_width to i64
+ br i1 %cmp227, label %for.preheader, label %cleanup
+
+for.preheader: ; preds = %for.preheader.lr.ph, %for.latch
+ %y.033.us = phi i32 [ %inc17.us, %for.latch ], [ 0, %for.preheader.lr.ph ]
+ %dst.addr.032.us = phi ptr [ %add.ptr.us, %for.latch ], [ %dst, %for.preheader.lr.ph ]
+ %src1.addr.031.us = phi ptr [ %add.ptr13.us, %for.latch ], [ %src1, %for.preheader.lr.ph ]
+ %src2.addr.030.us = phi ptr [ %add.ptr15.us, %for.latch ], [ %src2, %for.preheader.lr.ph ]
+ br label %for.body
+
+for.body: ; preds = %for.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx.us, align 1
+ %conv.us = zext i8 %0 to i16
+ %arrayidx6.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx6.us, align 1
+ %conv7.us = zext i8 %1 to i16
+ %add.us = add nuw nsw i16 %conv.us, 1
+ %add8.us = add nuw nsw i16 %add.us, %conv7.us
+ %shr.us = lshr i16 %add8.us, 1
+ %conv9.us = trunc i16 %shr.us to i8
+ %arrayidx11.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %indvars.iv
+ store i8 %conv9.us, ptr %arrayidx11.us, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.latch, label %for.body
+
+for.latch: ; preds = %for.body
+ %add.ptr.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %idx.ext
+ %add.ptr13.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %idx.ext12
+ %add.ptr15.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %idx.ext14
+ %inc17.us = add nuw nsw i32 %y.033.us, 1
+ %exitcond36.not = icmp eq i32 %inc17.us, %i_height
+ br i1 %exitcond36.not, label %cleanup, label %for.preheader
+
+cleanup: ; preds = %for.latch, %for.preheader.lr.ph, %entry
+ ret void
+}
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5220,9 +5220,10 @@
return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
// To avoid the need for FP division:
- // (CostA / A.Width) < (CostB / B.Width)
- // <=> (CostA * B.Width) < (CostB * A.Width)
- return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
+ // (CostA / A.Width) <= (CostB / B.Width)
+ // <=> (CostA * B.Width) <= (CostB * A.Width)
+ // Choose the wider VF when they have the same cost.
+ return (CostA * EstimatedWidthB) <= (CostB * EstimatedWidthA);
}
static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D158988.553923.patch
Type: text/x-patch
Size: 4141 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20230828/6581cf7f/attachment.bin>
More information about the llvm-commits
mailing list