[llvm] 0ab0c1d - [SLP]Introduce transformNodes() and transform loads + reverse to strided loads.
via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 22 09:32:00 PDT 2024
Author: Alexey Bataev
Date: 2024-04-22T12:31:57-04:00
New Revision: 0ab0c1d982876662a45adb9bafaa3c2d3bdf1939
URL: https://github.com/llvm/llvm-project/commit/0ab0c1d982876662a45adb9bafaa3c2d3bdf1939
DIFF: https://github.com/llvm/llvm-project/commit/0ab0c1d982876662a45adb9bafaa3c2d3bdf1939.diff
LOG: [SLP]Introduce transformNodes() and transform loads + reverse to strided loads.
Introduced transformNodes() function to perform transformation of the
nodes (cost-based, instruction count based, etc.).
Implemented transformation of consecutive loads + reverse order to
strided loads with stride -1, if profitable.
Reviewers: RKSimon, preames, topperc
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/88530
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 685a5907d94fe7..6ac380a6ab6c6c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1126,6 +1126,9 @@ class BoUpSLP {
void
buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
+ /// Transforms graph nodes to target specific representations, if profitable.
+ void transformNodes();
+
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
VectorizableTree.clear();
@@ -7813,6 +7816,43 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
return std::make_pair(ScalarCost, VecCost);
}
+void BoUpSLP::transformNodes() {
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ TreeEntry &E = *TE.get();
+ switch (E.getOpcode()) {
+ case Instruction::Load: {
+ Type *ScalarTy = E.getMainOp()->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
+ // Check if profitable to represent consecutive load + reverse as strided
+ // load with stride -1.
+ if (isReverseOrder(E.ReorderIndices) &&
+ TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+ SmallVector<int> Mask;
+ inversePermutation(E.ReorderIndices, Mask);
+ auto *BaseLI = cast<LoadInst>(E.Scalars.back());
+ InstructionCost OriginalVecCost =
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
+ BaseLI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo()) +
+ ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+ InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
+ Instruction::Load, VecTy, BaseLI->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
+ if (StridedCost < OriginalVecCost)
+ // Strided load is more profitable than consecutive load + reverse -
+ // transform the node to strided load.
+ E.State = TreeEntry::StridedVectorize;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
@@ -15189,6 +15229,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
R.buildExternalUses();
R.computeMinimumValueSizes();
+ R.transformNodes();
InstructionCost Cost = R.getTreeCost();
@@ -15567,6 +15608,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
R.buildExternalUses();
R.computeMinimumValueSizes();
+ R.transformNodes();
InstructionCost Cost = R.getTreeCost();
CandidateFound = true;
MinCost = std::min(MinCost, Cost);
@@ -16563,6 +16605,7 @@ class HorizontalReduction {
V.buildExternalUses(LocalExternallyUsedValues);
V.computeMinimumValueSizes();
+ V.transformNodes();
// Estimate cost.
InstructionCost TreeCost = V.getTreeCost(VL);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index 03acc0009fb04c..44d320c75fedd4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -240,11 +240,10 @@ define void @test3(ptr %p, ptr noalias %s) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
More information about the llvm-commits
mailing list