[llvm] Discourage vectorisation of small loops with gathers and scatters (PR #109262)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 19 02:58:46 PDT 2024
https://github.com/VladiKrapp-Arm created https://github.com/llvm/llvm-project/pull/109262
On Cortex-M85, our benchmarks run faster with scalar loops than with vectorised loops that use gathers and scatters. As an initial approach, this patch disables gathers and scatters for Cortex-M85 and discourages vectorisation of such loops so that they are scalarised instead. This improves performance on several benchmarks.
Additionally, disallow tail predication for loops with strided accesses when gathers and scatters are disabled.
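For context, here is a minimal sketch (illustrative only; the function, names and constants are not taken from the patch or its benchmarks) of the kind of small strided-access loop this change targets:

    // A small loop with a non-unit stride. With MVE gathers/scatters enabled,
    // the strided loads can be vectorised into gather instructions; on
    // Cortex-M85 the scalar form of such small loops measured faster.
    void scale_strided(int *Dst, const int *Src, int N, int Stride) {
      for (int I = 0; I < N; ++I)
        Dst[I] = Src[I * Stride] * 2; // strided load -> gather if vectorised
    }

With this patch, the cost-model changes below make the vectorised form of such a loop look more expensive on cortex-m85, so the loop vectoriser is expected to keep it scalar.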
From ac8cf305bd7f330e0ba6de87ff8c06bb2b602497 Mon Sep 17 00:00:00 2001
From: Vladi Krapp <vladi.krapp at arm.com>
Date: Thu, 19 Sep 2024 10:55:31 +0100
Subject: [PATCH] Discourage vectorisation of small loops with gathers and
scatters
On Cortex-M85, our benchmarks run faster with scalar loops than with vectorised loops that use gathers and scatters. As an initial approach, this patch disables gathers and scatters for Cortex-M85 and discourages vectorisation of such loops so that they are scalarised instead. This improves performance on several benchmarks.
Additionally, disallow tail predication for loops with strided accesses when gathers and scatters are disabled.
---
llvm/lib/Target/ARM/ARMFeatures.td | 6 ++++++
llvm/lib/Target/ARM/ARMProcessors.td | 1 +
.../lib/Target/ARM/ARMTargetTransformInfo.cpp | 20 +++++++++++++------
3 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
index 8b0ade54b46d3c..6fe3f1cc5becc9 100644
--- a/llvm/lib/Target/ARM/ARMFeatures.td
+++ b/llvm/lib/Target/ARM/ARMFeatures.td
@@ -254,6 +254,12 @@ def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32",
"true",
"Has slow VDUP32 - prefer VMOV">;
+// For some MVE-enabled processors, scalar loops are preferred when they are small
+// enough that the overhead of gathers and scatters is detrimental to performance.
+// In these cases, prefer not to use gathers and scatters, or avoid vectorising entirely.
+def FeaturePreferScalarToGatherScatter : SubtargetFeature<"prefer-scalar-to-gatscat", "PreferScalarToGatherScatter",
+ "true", "Disable gather and scatter instructions">;
+
// Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON
// for scalar FP, as this allows more effective execution domain optimization.
// True if VMOVSR will be favored over VMOVDRR.
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
index e4e122a0d1339b..cb41026746aa6b 100644
--- a/llvm/lib/Target/ARM/ARMProcessors.td
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -386,6 +386,7 @@ def : ProcessorModel<"cortex-m85", CortexM85Model, [ARMv81mMainline,
FeatureFPARMv8_D16,
FeaturePACBTI,
FeatureUseMISched,
+ FeaturePreferScalarToGatherScatter,
HasMVEFloatOps]>;
def : ProcessorModel<"cortex-m52", CortexM55Model, [ARMv81mMainline,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 912569a8fec118..2ae84775c95021 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1075,14 +1075,19 @@ InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
unsigned NumVectorInstToHideOverhead = 10;
int MaxMergeDistance = 64;
- if (ST->hasNEON()) {
+ // For processors that prefer scalar loops to vector loops with gathers and
+ // scatters we disable them, but in a vectorised loop this results in a large
+ // number of contiguous loads and stores that make the vectorised loop slower
+ // than a scalar loop, so increase the cost of those scalar loads and stores.
+ if (ST->hasNEON() || ST->preferScalarToGatherScatter()) {
if (Ty->isVectorTy() && SE &&
!BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
return NumVectorInstToHideOverhead;
// In many cases the address computation is not merged into the instruction
// addressing mode.
- return 1;
+ if (ST->hasNEON())
+ return 1;
}
return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
@@ -1125,7 +1130,8 @@ bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
}
bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
- if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
+ if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps() ||
+ ST->preferScalarToGatherScatter())
return false;
unsigned EltWidth = Ty->getScalarSizeInBits();
@@ -2327,7 +2333,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
//
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
const DataLayout &DL,
- const LoopAccessInfo *LAI) {
+ const LoopAccessInfo *LAI,
+ const ARMSubtarget *ST) {
LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
// If there are live-out values, it is probably a reduction. We can predicate
@@ -2392,7 +2399,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
"be tail-predicated\n.");
return false;
// TODO: don't tail predicate if there is a reversed load?
- } else if (EnableMaskedGatherScatters) {
+ } else if (EnableMaskedGatherScatters &&
+ !ST->preferScalarToGatherScatter()) {
// Gather/scatters do allow loading from arbitrary strides, at
// least if they are loop invariant.
// TODO: Loop variant strides should in theory work, too, but
@@ -2465,7 +2473,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
return false;
}
- return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
+ return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(), ST);
}
TailFoldingStyle