[llvm] [LV][VPlan] Not adding shuffle cost when store loop invariant value. (PR #109644)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 23 03:05:19 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Elvis Wang (ElvisWang123)
<details>
<summary>Changes</summary>
This patch makes VPWidenMemoryRecipe and the legacy cost model skip the reverse shuffle cost when the stored value is loop invariant.
This patch also fixes #109468.
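For context, here is a minimal sketch of the kind of loop this change is aimed at (the function and variable names below are mine, not taken from the patch): a consecutive store with negative stride whose stored value never changes inside the loop. The widened stored value is a broadcast of a single scalar, so reversing that vector is a no-op, and charging an `SK_Reverse` shuffle for it overestimates the cost.

```cpp
// Hypothetical example of the pattern whose cost this patch corrects:
// a reversed (negative-stride) consecutive store of a loop-invariant value.
void store_invariant_reversed(long *p, long n, long v) {
  // The access direction is reversed, so the vectorizer reverses the
  // mask and the stored vector; but `v` is the same scalar on every
  // iteration, so the reversed splat is identical to the splat itself.
  for (long i = n - 1; i >= 0; --i)
    p[i] = v;
}
```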
---
Full diff: https://github.com/llvm/llvm-project/pull/109644.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+4-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+3-1)
- (added) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll (+160)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 806dbd5bb7304e..ec43cafc685a9e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5666,7 +5666,10 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
   }
   bool Reverse = ConsecutiveStride < 0;
-  if (Reverse)
+  const StoreInst *SI = dyn_cast<StoreInst>(I);
+  bool IsLoopInvariantStoreValue =
+      SI && Legal->isInvariant(const_cast<StoreInst *>(SI)->getValueOperand());
+  if (Reverse && !IsLoopInvariantStoreValue)
     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
                                CostKind, 0);
   return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9a2cfbc35cb84f..8bd67109403d35 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2253,7 +2253,9 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
                                     CostKind, OpInfo, &Ingredient);
   }
-  if (!Reverse)
+  // If the store value is a live-in scalar value which is uniform, we don't
+  // need to calculate the reverse cost.
+  if (!Reverse || (isa<StoreInst>(Ingredient) && getOperand(1)->isLiveIn()))
     return Cost;
 
   return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
new file mode 100644
index 00000000000000..7fd9b50a53836d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -S| FileCheck %s
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl -S| FileCheck %s --check-prefixes=EVL
+; COM: From issue #109468
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-gnu"
+
+define void @lshift_significand(i32 %n, ptr nocapture writeonly %0) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @lshift_significand(
+; CHECK-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP8]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[INDEX]]
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP12]]
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x i64> [[VEC_IV]], i32 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP14]], i64 [[TMP1]])
+; CHECK-NEXT: [[IDXPROM12:%.*]] = sub nuw nsw i64 1, [[TMP11]]
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[IDXPROM12]]
+; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 0, [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = sub i64 1, [[TMP18]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i64, ptr [[TMP21]], i64 [[TMP20]]
+; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[REVERSE1]], ptr [[TMP22]], i32 8, <vscale x 2 x i1> [[REVERSE]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[FOR_END16:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY9:.*]]
+; CHECK: [[FOR_BODY9]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY9]] ]
+; CHECK-NEXT: [[TMP24:%.*]] = sub nuw nsw i64 1, [[INDVARS_IV]]
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[TMP24]]
+; CHECK-NEXT: store i64 0, ptr [[ARRAYIDX14]], align 8
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 3
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END16]], label %[[FOR_BODY9]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[FOR_END16]]:
+; CHECK-NEXT: ret void
+;
+; EVL-LABEL: define void @lshift_significand(
+; EVL-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; EVL-NEXT: [[ENTRY:.*]]:
+; EVL-NEXT: [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
+; EVL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
+; EVL-NEXT: [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
+; EVL-NEXT: [[TMP2:%.*]] = sub i64 -1, [[TMP1]]
+; EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; EVL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; EVL-NEXT: br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EVL: [[VECTOR_PH]]:
+; EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP8]]
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; EVL-NEXT: [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
+; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; EVL: [[VECTOR_BODY]]:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP11:%.*]] = sub i64 [[TMP1]], [[EVL_BASED_IV]]
+; EVL-NEXT: [[SUB11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 2, i1 true)
+; EVL-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
+; EVL-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; EVL-NEXT: [[TMP14:%.*]] = sub nuw nsw i64 1, [[TMP13]]
+; EVL-NEXT: [[TMP15:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[TMP14]]
+; EVL-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; EVL-NEXT: [[TMP18:%.*]] = mul i64 0, [[TMP17]]
+; EVL-NEXT: [[TMP19:%.*]] = sub i64 1, [[TMP17]]
+; EVL-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP15]], i64 [[TMP18]]
+; EVL-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[TMP20]], i64 [[TMP19]]
+; EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[SUB11]])
+; EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP21]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[SUB11]])
+; EVL-NEXT: [[IDXPROM12:%.*]] = zext i32 [[SUB11]] to i64
+; EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[IDXPROM12]], [[EVL_BASED_IV]]
+; EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; EVL-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL: [[MIDDLE_BLOCK]]:
+; EVL-NEXT: br i1 true, label %[[FOR_END16:.*]], label %[[SCALAR_PH]]
+; EVL: [[SCALAR_PH]]:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
+; EVL-NEXT: br label %[[FOR_BODY9:.*]]
+; EVL: [[FOR_BODY9]]:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY9]] ]
+; EVL-NEXT: [[TMP24:%.*]] = sub nuw nsw i64 1, [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX13:%.*]] = getelementptr [3 x i64], ptr [[TMP0]], i64 0, i64 [[TMP24]]
+; EVL-NEXT: store i64 0, ptr [[ARRAYIDX13]], align 8
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 3
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END16]], label %[[FOR_BODY9]], !llvm.loop [[LOOP3:![0-9]+]]
+; EVL: [[FOR_END16]]:
+; EVL-NEXT: ret void
+;
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: write)
+entry:
+ %cmp1.peel = icmp eq i32 %n, 0
+ %spec.select = select i1 %cmp1.peel, i64 2, i64 0
+ br label %for.body9
+
+for.body9: ; preds = %entry, %for.body9
+ %indvars.iv = phi i64 [ %spec.select, %entry ], [ %indvars.iv.next, %for.body9 ]
+ %1 = sub nuw nsw i64 1, %indvars.iv
+ %arrayidx13 = getelementptr [3 x i64], ptr %0, i64 0, i64 %1
+ store i64 0, ptr %arrayidx13, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 3
+ br i1 %exitcond.not, label %for.end16, label %for.body9
+
+for.end16: ; preds = %for.body9
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
``````````
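A note on the two checks above: the legacy model queries `Legal->isInvariant()` on the store's value operand, while the VPlan recipe checks `getOperand(1)->isLiveIn()` (operand 1 of a widened store recipe is the stored value, operand 0 the address). Both conditions mean the widened stored value is a broadcast of a single scalar, so reversing it is a no-op; the test output above shows this as `@llvm.vector.reverse` applied to `zeroinitializer`.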
</details>
https://github.com/llvm/llvm-project/pull/109644
More information about the llvm-commits mailing list