[llvm] 9448cdc - [SVE][Analysis] Tune the cost model according to the tune-cpu attribute

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 21 01:33:56 PDT 2021


Author: David Sherwood
Date: 2021-10-21T09:33:50+01:00
New Revision: 9448cdc90007611659ecbec4dca18a83f06bc4c3

URL: https://github.com/llvm/llvm-project/commit/9448cdc90007611659ecbec4dca18a83f06bc4c3
DIFF: https://github.com/llvm/llvm-project/commit/9448cdc90007611659ecbec4dca18a83f06bc4c3.diff

LOG: [SVE][Analysis] Tune the cost model according to the tune-cpu attribute

This patch introduces a new function:

  AArch64Subtarget::getVScaleForTuning

that returns a value for vscale that can be used for tuning the cost
model when using scalable vectors. The VScaleForTuning option in
AArch64Subtarget is initialised according to the following rules:

1. If the user has specified the CPU to tune for we use that, else
2. If the target CPU was specified we use that, else
3. The tuning is set to "generic".

For CPUs of type "generic" I have assumed that vscale=2.

New tests added here:

  Analysis/CostModel/AArch64/sve-gather.ll
  Analysis/CostModel/AArch64/sve-scatter.ll
  Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll

Differential Revision: https://reviews.llvm.org/D110259

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64Subtarget.cpp
    llvm/lib/Target/AArch64/AArch64Subtarget.h
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
    llvm/test/Analysis/CostModel/AArch64/sve-gather.ll
    llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
    llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 30dd67e619beb..7955ede0f3feb 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -80,8 +80,11 @@ void AArch64Subtarget::initializeProperties() {
     break;
   case CortexA53:
   case CortexA55:
+    PrefFunctionLogAlignment = 4;
+    break;
   case CortexA510:
     PrefFunctionLogAlignment = 4;
+    VScaleForTuning = 1;
     break;
   case CortexA57:
     MaxInterleaveFactor = 4;
@@ -109,6 +112,7 @@ void AArch64Subtarget::initializeProperties() {
     PrefetchDistance = 128;
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 4;
+    VScaleForTuning = 4;
     break;
   case AppleA7:
   case AppleA10:
@@ -150,9 +154,15 @@ void AArch64Subtarget::initializeProperties() {
     PrefFunctionLogAlignment = 3;
     break;
   case NeoverseN1:
+    PrefFunctionLogAlignment = 4;
+    break;
   case NeoverseN2:
+    PrefFunctionLogAlignment = 4;
+    VScaleForTuning = 1;
+    break;
   case NeoverseV1:
     PrefFunctionLogAlignment = 4;
+    VScaleForTuning = 2;
     break;
   case Saphira:
     MaxInterleaveFactor = 4;

diff  --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index c80a89330b774..e5167642bd5d3 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -280,6 +280,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
 
   unsigned MinSVEVectorSizeInBits;
   unsigned MaxSVEVectorSizeInBits;
+  unsigned VScaleForTuning = 2;
 
   /// TargetTriple - What processor and OS we're targeting.
   Triple TargetTriple;
@@ -655,6 +656,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   }
 
   bool useSVEForFixedLengthVectors() const;
+
+  unsigned getVScaleForTuning() const { return VScaleForTuning; }
 };
 } // End llvm namespace
 

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0c9cba27e3eea..9d7434377b1d1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1632,7 +1632,7 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
   ElementCount LegalVF = LT.second.getVectorElementCount();
   InstructionCost MemOpCost =
       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
-  return LT.first * MemOpCost * getMaxNumElements(LegalVF, I->getFunction());
+  return LT.first * MemOpCost * getMaxNumElements(LegalVF);
 }
 
 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index b1ce616423c44..f7de7c0ee176d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -130,20 +130,11 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   /// when scalarizing an operation for a vector with ElementCount \p VF.
   /// For scalable vectors this currently takes the most pessimistic view based
   /// upon the maximum possible value for vscale.
-  unsigned getMaxNumElements(ElementCount VF,
-                             const Function *F = nullptr) const {
+  unsigned getMaxNumElements(ElementCount VF) const {
     if (!VF.isScalable())
       return VF.getFixedValue();
 
-    unsigned MaxNumVScale = 16;
-    if (F && F->hasFnAttribute(Attribute::VScaleRange)) {
-      unsigned VScaleMax =
-          F->getFnAttribute(Attribute::VScaleRange).getVScaleRangeArgs().second;
-      if (VScaleMax > 0)
-        MaxNumVScale = VScaleMax;
-    }
-
-    return MaxNumVScale * VF.getKnownMinValue();
+    return VF.getKnownMinValue() * ST->getVScaleForTuning();
   }
 
   unsigned getMaxInterleaveFactor(unsigned VF);

diff  --git a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll
index ccce7a5726303..b932a980c9aa9 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll
@@ -1,32 +1,95 @@
-; Check getIntrinsicInstrCost in BasicTTIImpl.h for masked gather
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -analyze -cost-model < %s | FileCheck %s
+; RUN: opt -analyze -cost-model -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-2
+; RUN: opt -analyze -cost-model -mcpu=neoverse-n2 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-1
+; RUN: opt -analyze -cost-model -mcpu=cortex-a510 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-1
 
-; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve  < %s | FileCheck %s
+target triple="aarch64--linux-gnu"
 
-define void @masked_gathers(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) vscale_range(0, 16) {
+define void @masked_gathers(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #0 {
 ; CHECK-LABEL: 'masked_gathers'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction:   %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction:   %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction:   %res.v4i32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction:   %res.v1i128 = call <1 x i128> @llvm.masked.gather.v1i128.v1p0i128
-; CHECK-NEXT: Cost Model: Invalid cost for instruction:   %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-2-LABEL: 'masked_gathers'
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-1-LABEL: 'masked_gathers'
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
   %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
-  %res.v4i32 = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> undef, i32 0, <4 x i1> %v4i1mask, <4 x i32> zeroinitializer)
-  %res.v1i128 = call <1 x i128> @llvm.masked.gather.v1i128.v1p0i128(<1 x i128*> undef, i32 0, <1 x i1> %v1i1mask, <1 x i128> zeroinitializer)
-  %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+  %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
   ret void
 }
 
-define void @masked_gathers_no_vscale_range() {
+define void @masked_gathers_tune_generic(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #1 {
+; CHECK-LABEL: 'masked_gathers_tune_generic'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-2-LABEL: 'masked_gathers_tune_generic'
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-1-LABEL: 'masked_gathers_tune_generic'
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
+  %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
+  %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+  ret void
+}
+
+define void @masked_gathers_no_vscale_range() #2 {
 ; CHECK-LABEL: 'masked_gathers_no_vscale_range'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-2-LABEL: 'masked_gathers_no_vscale_range'
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-1-LABEL: 'masked_gathers_no_vscale_range'
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0i16(<vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef)
   %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
 
@@ -41,11 +104,13 @@ define void @masked_gathers_no_vscale_range() {
   ret void
 }
 
+attributes #0 = { "target-features"="+sve" vscale_range(0, 8) }
+attributes #1 = { "target-features"="+sve" vscale_range(0, 16) "tune-cpu"="generic" }
+attributes #2 = { "target-features"="+sve" }
+
 declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
 declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*>, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
-declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
-declare <1 x i128> @llvm.masked.gather.v1i128.v1p0i128(<1 x i128*>, i32, <1 x i1>, <1 x i128>)
-declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
+declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x i64*>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
 declare <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x double*>, i32, <vscale x 4 x i1>, <vscale x 4 x double>)
 declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
 declare <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x float*>, i32, <vscale x 8 x i1>, <vscale x 8 x float>)

diff  --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 19c2539d1bf4c..7c4a46c1343bc 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -80,8 +80,8 @@ define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale
 
 define void @strict_fp_reductions(<vscale x 4 x float> %v0, <vscale x 4 x double> %v1) {
 ; CHECK-LABEL: 'strict_fp_reductions'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void

diff  --git a/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll b/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll
index 4abf8c538868c..a3454084e5f5e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll
@@ -1,32 +1,95 @@
-; Check getIntrinsicInstrCost in BasicTTIImpl.h with for  masked scatter
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -analyze -cost-model < %s | FileCheck %s
+; RUN: opt -analyze -cost-model -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-2
+; RUN: opt -analyze -cost-model -mcpu=neoverse-n2 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-1
+; RUN: opt -analyze -cost-model -mcpu=cortex-a510 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-1
 
-; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve  < %s | FileCheck %s
+target triple="aarch64--linux-gnu"
 
-define void @masked_scatters(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) vscale_range(0, 16) {
+define void @masked_scatters(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #0 {
 ; CHECK-LABEL: 'masked_scatters'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v1i128.v1p0i128
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-2-LABEL: 'masked_scatters'
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-1-LABEL: 'masked_scatters'
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
   call void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
-  call void @llvm.masked.scatter.v4i32(<4 x i32> undef, <4 x i32*> undef, i32 0, <4 x i1> %v4i1mask)
-  call void @llvm.masked.scatter.v1i128.v1p0i128(<1 x i128> undef, <1 x i128*> undef, i32 0, <1 x i1> %v1i1mask)
-  call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+  call void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
   ret void
 }
 
-define void @masked_scatters_no_vscale_range() {
+define void @masked_scatters_tune_generic(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #1 {
+; CHECK-LABEL: 'masked_scatters_tune_generic'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-2-LABEL: 'masked_scatters_tune_generic'
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-1-LABEL: 'masked_scatters_tune_generic'
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32*> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
+  call void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32*> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
+  call void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64*> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+  ret void
+}
+
+define void @masked_scatters_no_vscale_range() #2 {
 ; CHECK-LABEL: 'masked_scatters_no_vscale_range'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-2-LABEL: 'masked_scatters_no_vscale_range'
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-1-LABEL: 'masked_scatters_no_vscale_range'
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> undef, <vscale x 8 x float*> undef, i32 1, <vscale x 8 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> undef, <vscale x 4 x float*> undef, i32 1, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> undef, <vscale x 2 x float*> undef, i32 1, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16(<vscale x 16 x i16> undef, <vscale x 16 x i16*> undef, i32 1, <vscale x 16 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16(<vscale x 8 x i16> undef, <vscale x 8 x i16*> undef, i32 1, <vscale x 8 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> undef, <vscale x 4 x i16*> undef, i32 1, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   call void @llvm.masked.scatter.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double*> undef, i32 1, <vscale x 4 x i1> undef)
   call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double*> undef, i32 1, <vscale x 2 x i1> undef)
 
@@ -41,11 +104,13 @@ define void @masked_scatters_no_vscale_range() {
   ret void
 }
 
+attributes #0 = { "target-features"="+sve" vscale_range(0, 8) }
+attributes #1 = { "target-features"="+sve" vscale_range(0, 16) "tune-cpu"="generic" }
+attributes #2 = { "target-features"="+sve" }
+
 declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32*>, i32, <vscale x 4 x i1>)
 declare void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32*>, i32, <vscale x 8 x i1>)
-declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
-declare void @llvm.masked.scatter.v1i128.v1p0i128(<1 x i128>, <1 x i128*>, i32, <1 x i1>)
-declare void @llvm.masked.scatter.nxv1i64.nxv1p0i64(<vscale x 1 x i64>, <vscale x 1 x i64*>, i32, <vscale x 1 x i1>)
+declare void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64*>, i32, <vscale x 1 x i1>)
 declare void @llvm.masked.scatter.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double*>, i32, <vscale x 4 x i1>)
 declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
 declare void @llvm.masked.scatter.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float*>, i32, <vscale x 8 x i1>)

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
index 0221c890a6e1b..89162d238f8cf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
@@ -3,11 +3,14 @@
 ; RUN:   -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4
 ; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \
 ; RUN:   -scalable-vectorization=on -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8
+; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \
+; RUN:   -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=1 -mcpu=neoverse-n2 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4-CPU-NEOVERSE-N2
 
 target triple="aarch64-unknown-linux-gnu"
 
-; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 16 for VF vscale x 4 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 32 for VF vscale x 8 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-VF4-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %add = fadd float %0, %sum.07
 
 define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 {
 entry:
@@ -28,8 +31,9 @@ for.end:
 }
 
 
-; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction:   %add = fadd double %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 16 for VF vscale x 4 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 32 for VF vscale x 8 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-VF4-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %add = fadd double %0, %sum.07
 
 define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 {
 entry:


        


More information about the llvm-commits mailing list