[llvm] [SLP]Introduce transformNodes() and transform loads + reverse to strided loads. (PR #88530)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 12 08:57:48 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Alexey Bataev (alexey-bataev)
<details>
<summary>Changes</summary>
Introduced transformNodes() function to perform transformation of the
nodes (cost-based, instruction count based, etc.).
Implemented transformation of consecutive loads + reverse order to
strided loads with stride -1, if profitable.
---
Full diff: https://github.com/llvm/llvm-project/pull/88530.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+43)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll (+2-3)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index df891371fdf758..6f63b08581cd91 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1117,6 +1117,9 @@ class BoUpSLP {
void
buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
+ /// Transforms graph nodes to target specific representations, if profitable.
+ void transformNodes();
+
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
VectorizableTree.clear();
@@ -7750,6 +7753,43 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
return std::make_pair(ScalarCost, VecCost);
}
+void BoUpSLP::transformNodes() {
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ TreeEntry &E = *TE.get();
+ switch (E.getOpcode()) {
+ case Instruction::Load: {
+ Type *ScalarTy = E.getMainOp()->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
+ // Check if profitable to represent consecutive load + reverse as strided
+ // load with stride -1.
+ if (isReverseOrder(E.ReorderIndices) &&
+ TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+ SmallVector<int> Mask;
+ inversePermutation(E.ReorderIndices, Mask);
+ auto *BaseLI = cast<LoadInst>(E.Scalars.back());
+ InstructionCost OriginalVecCost =
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
+ BaseLI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo()) +
+ ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+ InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
+ Instruction::Load, VecTy, BaseLI->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
+ if (StridedCost < OriginalVecCost)
+ // Strided load is more profitable than consecutive load + reverse -
+ // transform the node to strided load.
+ E.State = TreeEntry::StridedVectorize;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
@@ -15017,6 +15057,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
R.buildExternalUses();
R.computeMinimumValueSizes();
+ R.transformNodes();
InstructionCost Cost = R.getTreeCost();
@@ -15387,6 +15428,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
R.buildExternalUses();
R.computeMinimumValueSizes();
+ R.transformNodes();
InstructionCost Cost = R.getTreeCost();
CandidateFound = true;
MinCost = std::min(MinCost, Cost);
@@ -16383,6 +16425,7 @@ class HorizontalReduction {
V.buildExternalUses(LocalExternallyUsedValues);
V.computeMinimumValueSizes();
+ V.transformNodes();
// Estimate cost.
InstructionCost TreeCost = V.getTreeCost(VL);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index 03acc0009fb04c..44d320c75fedd4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -240,11 +240,10 @@ define void @test3(ptr %p, ptr noalias %s) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
``````````
</details>
https://github.com/llvm/llvm-project/pull/88530
More information about the llvm-commits
mailing list