[llvm] Discourage vectorisation of small loops with gathers and scatters (PR #109262)

via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 19 02:58:46 PDT 2024


https://github.com/VladiKrapp-Arm created https://github.com/llvm/llvm-project/pull/109262

On Cortex-M85, our benchmarks run faster with scalar loops than with vectorised loops that use gathers and scatters. As an initial approach, this patch disables gathers and scatters for Cortex-M85 and discourages vectorisation of the affected loops so that they remain scalar. This improves performance in several benchmarks.

Additionally, don't allow tail predication with strided accesses when gathers and scatters are disabled.
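
For illustration, here is a hypothetical C++ example (the function names are made up and not taken from the benchmarks) contrasting the two kinds of loop. Unit-stride loops are unaffected and can still be vectorised with ordinary contiguous MVE loads and stores; only loops whose accesses would need a gather or scatter are steered back to scalar code.

// Unit-stride accesses: unaffected by this patch, still a candidate for
// vectorisation with ordinary contiguous MVE loads and stores.
void scale_contiguous(int *dst, const int *src, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = src[i] * 3;
}

// Stride-2 access: vectorising this needs a gather load. With
// prefer-scalar-to-gatscat the cost model now discourages vectorising it,
// so the loop stays scalar on Cortex-M85.
void scale_strided(int *dst, const int *src, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = src[2 * i] * 3;
}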

From ac8cf305bd7f330e0ba6de87ff8c06bb2b602497 Mon Sep 17 00:00:00 2001
From: Vladi Krapp <vladi.krapp at arm.com>
Date: Thu, 19 Sep 2024 10:55:31 +0100
Subject: [PATCH] Discourage vectorisation of small loops with gathers and
 scatters

On Cortex-M85, our benchmarks run faster with scalar loops than with vectorised loops that use gathers and scatters. As an initial approach, this patch disables gathers and scatters for Cortex-M85 and discourages vectorisation of the affected loops so that they remain scalar. This improves performance in several benchmarks.

Additionally, don't allow tail predication with strided accesses when gathers and scatters are disabled.
---
 llvm/lib/Target/ARM/ARMFeatures.td            |  6 ++++++
 llvm/lib/Target/ARM/ARMProcessors.td          |  1 +
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 20 +++++++++++++------
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
index 8b0ade54b46d3c..6fe3f1cc5becc9 100644
--- a/llvm/lib/Target/ARM/ARMFeatures.td
+++ b/llvm/lib/Target/ARM/ARMFeatures.td
@@ -254,6 +254,12 @@ def FeatureSlowVDUP32     : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32",
                                              "true",
                                              "Has slow VDUP32 - prefer VMOV">;
 
+// For some MVE-enabled processors, scalar loops are preferred when they are small enough
+// that the overhead of gathers and scatters is detrimental to performance.
+// In these cases, prefer not to use gathers and scatters, or avoid vectorising entirely.
+def FeaturePreferScalarToGatherScatter : SubtargetFeature<"prefer-scalar-to-gatscat", "PreferScalarToGatherScatter",
+                                             "true", "Disable gather and scatter instructions">;
+
 // Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON
 // for scalar FP, as this allows more effective execution domain optimization.
 // True if VMOVSR will be favored over VMOVDRR.
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
index e4e122a0d1339b..cb41026746aa6b 100644
--- a/llvm/lib/Target/ARM/ARMProcessors.td
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -386,6 +386,7 @@ def : ProcessorModel<"cortex-m85", CortexM85Model,      [ARMv81mMainline,
                                                          FeatureFPARMv8_D16,
                                                          FeaturePACBTI,
                                                          FeatureUseMISched,
+                                                         FeaturePreferScalarToGatherScatter,
                                                          HasMVEFloatOps]>;
 
 def : ProcessorModel<"cortex-m52", CortexM55Model,      [ARMv81mMainline,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 912569a8fec118..2ae84775c95021 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1075,14 +1075,19 @@ InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
   unsigned NumVectorInstToHideOverhead = 10;
   int MaxMergeDistance = 64;
 
-  if (ST->hasNEON()) {
+  // For processors that prefer scalar loops to vector loops with gathers and
+  // scatters, we disable gathers and scatters. In a vectorised loop this
+  // instead produces many contiguous loads and stores that can make the
+  // vectorised loop slower than a scalar loop, so increase their cost here.
+  if (ST->hasNEON() || ST->preferScalarToGatherScatter()) {
     if (Ty->isVectorTy() && SE &&
         !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
       return NumVectorInstToHideOverhead;
 
     // In many cases the address computation is not merged into the instruction
     // addressing mode.
-    return 1;
+    if (ST->hasNEON())
+      return 1;
   }
   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
 }
@@ -1125,7 +1130,8 @@ bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
 }
 
 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
-  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
+  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps() ||
+      ST->preferScalarToGatherScatter())
     return false;
 
   unsigned EltWidth = Ty->getScalarSizeInBits();
@@ -2327,7 +2333,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
 //
 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                  const DataLayout &DL,
-                                 const LoopAccessInfo *LAI) {
+                                 const LoopAccessInfo *LAI,
+                                 const ARMSubtarget *ST) {
   LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
 
   // If there are live-out values, it is probably a reduction. We can predicate
@@ -2392,7 +2399,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                         "be tail-predicated\n.");
           return false;
           // TODO: don't tail predicate if there is a reversed load?
-        } else if (EnableMaskedGatherScatters) {
+        } else if (EnableMaskedGatherScatters &&
+                   !ST->preferScalarToGatherScatter()) {
           // Gather/scatters do allow loading from arbitrary strides, at
           // least if they are loop invariant.
           // TODO: Loop variant strides should in theory work, too, but
@@ -2465,7 +2473,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
     return false;
   }
 
-  return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
+  return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(), ST);
 }
 
 TailFoldingStyle
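
As a reading aid for the cost-model part of the patch, the sketch below is a simplified, standalone paraphrase of the patched getAddressComputationCost logic. FakeSubtarget, addressComputationCost and the boolean parameters are illustrative stand-ins, not the real ARMTTIImpl/ARMSubtarget interface. It shows the intended behaviour: when the subtarget prefers scalar code to gathers and scatters, vector address computations that are not small-constant-strided get a high cost so the vectoriser is steered back to the scalar loop, while the existing NEON-only behaviour of returning cost 1 is kept.

#include <cstdio>

struct FakeSubtarget {
  bool HasNEON = false;
  bool PreferScalarToGatherScatter = true; // e.g. Cortex-M85 with this patch
};

// IsVector and IsSmallConstantStride stand in for Ty->isVectorTy() and
// isConstantStridedAccessLessThan() in the real code.
static int addressComputationCost(const FakeSubtarget &ST, bool IsVector,
                                  bool IsSmallConstantStride) {
  const int NumVectorInstToHideOverhead = 10;
  if (ST.HasNEON || ST.PreferScalarToGatherScatter) {
    // Expensive address computation: discourage vectorising the loop.
    if (IsVector && !IsSmallConstantStride)
      return NumVectorInstToHideOverhead;
    // Preserve the old NEON-only behaviour.
    if (ST.HasNEON)
      return 1;
  }
  return 0; // stands in for falling back to the generic base-class cost
}

int main() {
  FakeSubtarget M85; // MVE, no NEON, prefers scalar to gather/scatter
  std::printf("strided vector address cost: %d\n",
              addressComputationCost(M85, /*IsVector=*/true,
                                     /*IsSmallConstantStride=*/false));
  return 0;
}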


