[llvm] [LV][AArch64] Prefer Fixed over Scalable if cost-model is equal (Neoverse V2) (PR #95819)

Mon Jul 15 01:44:04 PDT 2024

https://github.com/sjoerdmeijer updated https://github.com/llvm/llvm-project/pull/95819

>From 2c50720ab53d3f02ec30f632e38c65ebaadc8613 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <smeijer at nvidia.com>
Date: Mon, 17 Jun 2024 16:02:18 +0530
Subject: [PATCH] [LV][AArch64] Prefer Fixed over Scalable if cost-model is
 equal (Neoverse V2)

For the Neoverse V2 we would like to prefer fixed width over scalable
vectorisation if the cost-model assigns an equal cost to both for certain
loops. This improves 7 kernels from TSVC-2 and several production kernels by
about 2x, and does not affect SPEC21017 INT and FP. This also adds a new TTI
hook that can steer the loop vectorizater to preferring fixed width
vectorization, which can be set per CPU. For now, this is only enabled for the
Neoverse V2.

There are 3 reasons why preferring NEON might be better in the case the
cost-model is a tie and the SVE vector size is the same as NEON (128-bit):
architectural reasons, micro-architecture reasons, and SVE codegen reasons. The
latter will be improved over time, so the more important reasons are the former
two. I.e., (micro) architecture reason is the use of LPD/STP instructions which
are not available in SVE2 and it avoids predication.

For what it is worth: this codegen strategy to generate more NEON is inline
with GCC's codegen strategy, which is actually even more aggressive in
generating NEON when no predication is required. We could be smarter about the
decision making, but this seems to be a first good step in the right direction,
and we can always revise this later (for example make the target hook more
general).
---
 .../llvm/Analysis/TargetTransformInfo.h       |  9 +++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  2 +
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  4 ++
 llvm/lib/Target/AArch64/AArch64Features.td    |  4 ++
 llvm/lib/Target/AArch64/AArch64Processors.td  |  1 +
 .../AArch64/AArch64TargetTransformInfo.h      |  4 ++
 .../Transforms/Vectorize/LoopVectorize.cpp    |  4 +-
 .../prefer-fixed-if-equal-to-scalable.ll      | 60 +++++++++++++++++++
 8 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index dcdd9f82cde8e..41641e08293ec 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1679,6 +1679,11 @@ class TargetTransformInfo {
         false; ///< If op is an fp min/max, whether NaNs may be present.
   };
 
+  /// \returns True if the targets prefers fixed width vectorization if the
+  /// loop vectorizer's cost-model assigns an equal cost to the fixed and
+  /// scalable version of the vectorized loop.
+  bool preferFixedOverScalableIfEqualCost() const;
+
   /// \returns True if the target prefers reductions in loop.
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const;
@@ -2149,6 +2154,7 @@ class TargetTransformInfo::Concept {
   virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const = 0;
+  virtual bool preferFixedOverScalableIfEqualCost() const = 0;
   virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                                      ReductionFlags) const = 0;
   virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
@@ -2882,6 +2888,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                 VectorType *VecTy) const override {
     return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
   }
+  bool preferFixedOverScalableIfEqualCost() const override {
+    return Impl.preferFixedOverScalableIfEqualCost();
+  }
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const override {
     return Impl.preferInLoopReduction(Opcode, Ty, Flags);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 01624de190d51..75ccf3900829d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -920,6 +920,8 @@ class TargetTransformInfoImplBase {
     return VF;
   }
 
+  bool preferFixedOverScalableIfEqualCost() const { return false; }
+
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              TTI::ReductionFlags Flags) const {
     return false;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c175d1737e54b..1ef0c6a3606fb 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1286,6 +1286,10 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
   return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
 }
 
+bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const {
+  return TTIImpl->preferFixedOverScalableIfEqualCost();
+}
+
 bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty,
                                                 ReductionFlags Flags) const {
   return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags);
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index e523957afc25a..832e44fe117e2 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -355,6 +355,10 @@ def FeatureTHE : ExtensionWithMArch<"the", "THE", "FEAT_THE",
 //  Armv9.0 Architecture Extensions
 //===----------------------------------------------------------------------===//
 
+def FeatureUseFixedOverScalableIfEqualCost: SubtargetFeature<"use-fixed-over-scalable-if-equal-cost",
+  "UseFixedOverScalableIfEqualCost", "true",
+  "Prefer fixed width loop vectorization over scalable if the cost-model assigns equal costs">;
+
 def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
   "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;
 
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 87927093a2c4c..71384a23c49af 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -525,6 +525,7 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
                                       FeatureALULSLFast,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
+                                      FeatureUseFixedOverScalableIfEqualCost,
                                       FeaturePredictableSelectIsExpensive]>;
 
 def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 3eb9aa963c018..a9189fd53f40b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -371,6 +371,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return TailFoldingStyle::DataWithoutLaneMask;
   }
 
+  bool preferFixedOverScalableIfEqualCost() const {
+    return ST->useFixedOverScalableIfEqualCost();
+  }
+
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);
 
   bool supportsScalableVectors() const {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5520baef7152d..00a1664ec1819 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4629,7 +4629,9 @@ bool LoopVectorizationPlanner::isMoreProfitable(
   // Assume vscale may be larger than 1 (or the value being tuned for),
   // so that scalable vectorization is slightly favorable over fixed-width
   // vectorization.
-  bool PreferScalable = A.Width.isScalable() && !B.Width.isScalable();
+  bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
+                        A.Width.isScalable() && !B.Width.isScalable();
+
   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
                                 const InstructionCost &RHS) {
     return PreferScalable ? LHS <= RHS : LHS < RHS;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll
new file mode 100644
index 0000000000000..41595cc7d8996
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll
@@ -0,0 +1,60 @@
+; RUN: opt -S < %s -passes=loop-vectorize -force-target-instruction-cost=1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+ at a = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64
+ at b = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64
+
+define void @NeoverseV2() #0 {
+; CHECK-LABEL: define void @NeoverseV2(
+; CHECK:       store <4 x float>
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4
+  %add = fadd fast float %1, %0
+  %2 = add nuw nsw i64 %indvars.iv, 16000
+  %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
+  store float %add, ptr %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 16000
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @GenericCPU() #1 {
+; CHECK-LABEL: define void @GenericCPU(
+; CHECK:       store <vscale x 4 x float>
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4
+  %add = fadd fast float %1, %0
+  %2 = add nuw nsw i64 %indvars.iv, 16000
+  %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
+  store float %add, ptr %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 16000
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve,+sve2,+v9a" }
+attributes #1 = { vscale_range(1,16) "target-cpu"="generic" "target-features"="+sve,+v9a" }