[llvm] [AArch64] Add getVectorInstrCost Codesize costs handling. (PR #130946)

Wed Mar 12 04:50:07 PDT 2025

https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/130946

We have a lot of missing Codesize costs for vector operations. This patch starts things off by adding codesize costs for getVectorInstrCost, returning a single cost instead of the VectorInsertExtractBaseCost (which is typically 2). Insert of a load are given a cost of 0 as they use ld1, otherwise the cost is 1.

>From 5e3db3c093ea73a926783d9ca408e4df1a809c3c Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 12 Mar 2025 09:19:19 +0000
Subject: [PATCH] [AArch64] Add getVectorInstrCost Codesize costs handling.

We have a lot of missing Codesize costs for vector operations. This patch
starts things off by adding codesize costs for getVectorInstrCost, returning a
single cost instead of the VectorInsertExtractBaseCost (which is typically 2).
Insert of a load are given a cost of 0 as they use ld1, otherwise the cost
is 1.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 23 ++++---
 .../AArch64/AArch64TargetTransformInfo.h      |  4 +-
 .../Analysis/CostModel/AArch64/arith-fp.ll    | 12 ++--
 .../CostModel/AArch64/insert-extract.ll       | 60 +++++++++----------
 .../Analysis/CostModel/AArch64/min-max.ll     | 32 +++++-----
 .../Analysis/CostModel/AArch64/reduce-fadd.ll | 40 ++++++-------
 .../CostModel/AArch64/shuffle-load.ll         | 14 ++---
 .../CostModel/AArch64/sve-intrinsics.ll       | 56 ++++++++---------
 8 files changed, 123 insertions(+), 118 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7cec8a17dfaaa..70f78cbcd6119 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3302,8 +3302,8 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
-    unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
-    const Instruction *I, Value *Scalar,
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+    bool HasRealUse, const Instruction *I, Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
   assert(Val->isVectorTy() && "This must be a vector type");
 
@@ -3336,12 +3336,16 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
     // and its second operand is a load, then we will generate a LD1, which
     // are expensive instructions.
     if (I && dyn_cast<LoadInst>(I->getOperand(1)))
-      return ST->getVectorInsertExtractBaseCost() + 1;
+      return CostKind == TTI::TCK_CodeSize
+                 ? 0
+                 : ST->getVectorInsertExtractBaseCost() + 1;
 
     // i1 inserts and extract will include an extra cset or cmp of the vector
     // value. Increase the cost by 1 to account.
     if (Val->getScalarSizeInBits() == 1)
-      return ST->getVectorInsertExtractBaseCost() + 1;
+      return CostKind == TTI::TCK_CodeSize
+                 ? 2
+                 : ST->getVectorInsertExtractBaseCost() + 1;
 
     // FIXME:
     // If the extract-element and insert-element instructions could be
@@ -3465,7 +3469,8 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
     return 0;
 
   // All other insert/extracts cost this much.
-  return ST->getVectorInsertExtractBaseCost();
+  return CostKind == TTI::TCK_CodeSize ? 1
+                                       : ST->getVectorInsertExtractBaseCost();
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
@@ -3474,22 +3479,22 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                    Value *Op1) {
   bool HasRealUse =
       Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
-  return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse);
+  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
     Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
-  return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar,
-                                  ScalarUserAndIdx);
+  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
+                                  Scalar, ScalarUserAndIdx);
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
                                                    Type *Val,
                                                    TTI::TargetCostKind CostKind,
                                                    unsigned Index) {
-  return getVectorInstrCostHelper(I.getOpcode(), Val, Index,
+  return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
                                   true /* HasRealUse */, &I);
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 8a3fd11705640..a5db213cf1f84 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -73,8 +73,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   /// of the extract(nullptr if user is not known before vectorization) and
   /// 'Idx' being the extract lane.
   InstructionCost getVectorInstrCostHelper(
-      unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
-      const Instruction *I = nullptr, Value *Scalar = nullptr,
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+      bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
       ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {});
 
 public:
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
index 1efd41426c4ef..de1b39db1539c 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
@@ -536,9 +536,9 @@ define void @fsqrt() {
 define void @fsqrt_fp16() {
 ; CHECK-BASE-LABEL: 'fsqrt_fp16'
 ; CHECK-BASE-NEXT:  Cost Model: Found costs of 1 for: %F16 = call half @llvm.sqrt.f16(half undef)
-; CHECK-BASE-NEXT:  Cost Model: Found costs of 10 for: %V4F16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
-; CHECK-BASE-NEXT:  Cost Model: Found costs of 22 for: %V8F16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
-; CHECK-BASE-NEXT:  Cost Model: Found costs of 44 for: %V16F16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V4F16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %V8F16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:30 Lat:44 SizeLat:44 for: %V16F16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
 ; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fsqrt_fp16'
@@ -679,9 +679,9 @@ define void @fma() {
 define void @fma_fp16() {
 ; CHECK-BASE-LABEL: 'fma_fp16'
 ; CHECK-BASE-NEXT:  Cost Model: Found costs of 1 for: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef)
-; CHECK-BASE-NEXT:  Cost Model: Found costs of 10 for: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-; CHECK-BASE-NEXT:  Cost Model: Found costs of 22 for: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-BASE-NEXT:  Cost Model: Found costs of 44 for: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:30 Lat:44 SizeLat:44 for: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
 ; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fma_fp16'
diff --git a/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll b/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
index 3baeb075f1813..cef6cf1a081e6 100644
--- a/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
@@ -11,38 +11,38 @@ target triple = "aarch64--linux-gnu"
 
 define void @vectorInstrCost() {
 ; CHECK-LABEL: 'vectorInstrCost'
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %ta0 = extractelement <8 x i1> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %ta1 = extractelement <8 x i1> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t1 = extractelement <8 x i8> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t2 = extractelement <8 x i8> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t3 = extractelement <4 x i16> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t4 = extractelement <4 x i16> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t5 = extractelement <2 x i32> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t6 = extractelement <2 x i32> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t7 = extractelement <2 x i64> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t8 = extractelement <2 x i64> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %ta0 = extractelement <8 x i1> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %ta1 = extractelement <8 x i1> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t1 = extractelement <8 x i8> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t2 = extractelement <8 x i8> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t3 = extractelement <4 x i16> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t4 = extractelement <4 x i16> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t5 = extractelement <2 x i32> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t6 = extractelement <2 x i32> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t7 = extractelement <2 x i64> undef, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t8 = extractelement <2 x i64> undef, i32 1
 ; CHECK-NEXT:  Cost Model: Found costs of 0 for: %t9 = extractelement <4 x half> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t10 = extractelement <4 x half> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t10 = extractelement <4 x half> undef, i32 1
 ; CHECK-NEXT:  Cost Model: Found costs of 0 for: %t11 = extractelement <2 x float> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t12 = extractelement <2 x float> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t12 = extractelement <2 x float> undef, i32 1
 ; CHECK-NEXT:  Cost Model: Found costs of 0 for: %t13 = extractelement <2 x double> undef, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t14 = extractelement <2 x double> undef, i32 1
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %t31 = insertelement <8 x i1> undef, i1 false, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %t41 = insertelement <8 x i1> undef, i1 true, i32 1
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t30 = insertelement <8 x i8> undef, i8 0, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t40 = insertelement <8 x i8> undef, i8 1, i32 1
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t50 = insertelement <4 x i16> undef, i16 2, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t60 = insertelement <4 x i16> undef, i16 3, i32 1
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t70 = insertelement <2 x i32> undef, i32 4, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t80 = insertelement <2 x i32> undef, i32 5, i32 1
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t90 = insertelement <2 x i64> undef, i64 6, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t100 = insertelement <2 x i64> undef, i64 7, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t14 = extractelement <2 x double> undef, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %t31 = insertelement <8 x i1> undef, i1 false, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %t41 = insertelement <8 x i1> undef, i1 true, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t30 = insertelement <8 x i8> undef, i8 0, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t40 = insertelement <8 x i8> undef, i8 1, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t50 = insertelement <4 x i16> undef, i16 2, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t60 = insertelement <4 x i16> undef, i16 3, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t70 = insertelement <2 x i32> undef, i32 4, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t80 = insertelement <2 x i32> undef, i32 5, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t90 = insertelement <2 x i64> undef, i64 6, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t100 = insertelement <2 x i64> undef, i64 7, i32 1
 ; CHECK-NEXT:  Cost Model: Found costs of 0 for: %t110 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1
 ; CHECK-NEXT:  Cost Model: Found costs of 0 for: %t130 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1
 ; CHECK-NEXT:  Cost Model: Found costs of 0 for: %t150 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 0
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %ta0 = extractelement <8 x i1> undef, i32 0
@@ -86,7 +86,7 @@ define void @vectorInstrCost() {
 define <8 x i8> @LD1_B(<8 x i8> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_B'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i8, ptr %i, align 1
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %v2
 ;
 entry:
@@ -98,7 +98,7 @@ entry:
 define <4 x i16> @LD1_H(<4 x i16> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_H'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i16, ptr %i, align 2
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %v2
 ;
 entry:
@@ -110,7 +110,7 @@ entry:
 define <4 x i32> @LD1_W(<4 x i32> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_W'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i32, ptr %i, align 4
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %v2
 ;
 entry:
@@ -122,7 +122,7 @@ entry:
 define <2 x i64> @LD1_X(<2 x i64> %vec, ptr noundef %i) {
 ; CHECK-LABEL: 'LD1_X'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i64, ptr %i, align 8
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %v2
 ;
 entry:
diff --git a/llvm/test/Analysis/CostModel/AArch64/min-max.ll b/llvm/test/Analysis/CostModel/AArch64/min-max.ll
index ed4b9dd2ba571..b824f5309adc1 100644
--- a/llvm/test/Analysis/CostModel/AArch64/min-max.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/min-max.ll
@@ -195,10 +195,10 @@ define void @smax() {
 define void @minnum16() {
 ; CHECK-NOF16-LABEL: 'minnum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found costs of 1 for: %f16 = call half @llvm.minnum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 4 for: %V2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 10 for: %V4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 22 for: %V8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 44 for: %V16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %V8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:30 Lat:44 SizeLat:44 for: %V16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-F16-LABEL: 'minnum16'
@@ -220,10 +220,10 @@ define void @minnum16() {
 define void @maxnum16() {
 ; CHECK-NOF16-LABEL: 'maxnum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found costs of 1 for: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 4 for: %V2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 10 for: %V4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 22 for: %V8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 44 for: %V16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %V8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:30 Lat:44 SizeLat:44 for: %V16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-F16-LABEL: 'maxnum16'
@@ -288,10 +288,10 @@ define void @maxnum() {
 define void @minimum16() {
 ; CHECK-NOF16-LABEL: 'minimum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found costs of 1 for: %f16 = call half @llvm.minimum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 4 for: %V2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 10 for: %V4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 22 for: %V8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 44 for: %V16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %V8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:30 Lat:44 SizeLat:44 for: %V16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-F16-LABEL: 'minimum16'
@@ -313,10 +313,10 @@ define void @minimum16() {
 define void @maximum16() {
 ; CHECK-NOF16-LABEL: 'maximum16'
 ; CHECK-NOF16-NEXT:  Cost Model: Found costs of 1 for: %f16 = call half @llvm.maximum.f16(half undef, half undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 4 for: %V2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 10 for: %V4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 22 for: %V8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NOF16-NEXT:  Cost Model: Found costs of 44 for: %V16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %V8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:30 Lat:44 SizeLat:44 for: %V16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
 ; CHECK-NOF16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-F16-LABEL: 'maximum16'
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
index 32a12d83e5534..f565924a325a3 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
@@ -7,11 +7,11 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @strict_fp_reductions() {
 ; CHECK-LABEL: 'strict_fp_reductions'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:10 SizeLat:6 for: %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:22 SizeLat:14 for: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:28 Lat:44 SizeLat:28 for: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:10 SizeLat:6 for: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:20 SizeLat:12 for: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:10 SizeLat:6 for: %fadd_v2f32 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:11 Lat:22 SizeLat:14 for: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:22 Lat:44 SizeLat:28 for: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:10 SizeLat:6 for: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:10 Lat:20 SizeLat:12 for: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:8 Lat:16 SizeLat:8 for: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
@@ -26,24 +26,24 @@ define void @strict_fp_reductions() {
 
 define void @strict_fp_reductions_fp16() {
 ; CHECK-NOFP16-LABEL: 'strict_fp_reductions_fp16'
-; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:6 Lat:10 SizeLat:6 for: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:14 Lat:22 SizeLat:14 for: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:30 Lat:46 SizeLat:30 for: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:76 CodeSize:60 Lat:92 SizeLat:60 for: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:10 SizeLat:6 for: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:22 SizeLat:14 for: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:23 Lat:46 SizeLat:30 for: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:76 CodeSize:46 Lat:92 SizeLat:60 for: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
 ; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-F16-LABEL: 'strict_fp_reductions_fp16'
-; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:10 SizeLat:6 for: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
-; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:22 SizeLat:14 for: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:30 CodeSize:30 Lat:46 SizeLat:30 for: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:60 CodeSize:60 Lat:92 SizeLat:60 for: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:10 SizeLat:6 for: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:11 Lat:22 SizeLat:14 for: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:30 CodeSize:23 Lat:46 SizeLat:30 for: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:60 CodeSize:46 Lat:92 SizeLat:60 for: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
 ; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-BF16-LABEL: 'strict_fp_reductions_fp16'
-; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:6 Lat:10 SizeLat:6 for: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
-; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:14 Lat:22 SizeLat:14 for: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:30 Lat:46 SizeLat:30 for: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:76 CodeSize:60 Lat:92 SizeLat:60 for: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:10 SizeLat:6 for: %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:22 SizeLat:14 for: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:23 Lat:46 SizeLat:30 for: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:76 CodeSize:46 Lat:92 SizeLat:60 for: %fadd_v16f16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
 ; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %fadd_v2f16 = call half @llvm.vector.reduce.fadd.v2f16(half 0.0, <2 x half> undef)
@@ -55,15 +55,15 @@ define void @strict_fp_reductions_fp16() {
 
 define void @strict_fp_reductions_bf16() {
 ; CHECK-NOFP16-LABEL: 'strict_fp_reductions_bf16'
-; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:14 Lat:22 SizeLat:14 for: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:22 SizeLat:14 for: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
 ; CHECK-NOFP16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-F16-LABEL: 'strict_fp_reductions_bf16'
-; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:14 Lat:22 SizeLat:14 for: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:22 SizeLat:14 for: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
 ; CHECK-F16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-BF16-LABEL: 'strict_fp_reductions_bf16'
-; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:22 SizeLat:14 for: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:11 Lat:22 SizeLat:14 for: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
 ; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4f8(bfloat 0.0, <4 x bfloat> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
index 156d85b5b100f..068fffb68c85e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
@@ -187,7 +187,7 @@ entry:
 define <8 x i8> @ld1r_8b_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_8b_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %tmp = load i8, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %lane = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %lane
 ;
@@ -201,7 +201,7 @@ entry:
 define <16 x i8> @ld1r_16b_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_16b_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %tmp = load i8, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %lane = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %lane
 ;
@@ -215,7 +215,7 @@ entry:
 define <4 x i16> @ld1r_4h_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_4h_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %tmp = load i16, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %lane = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %lane
 ;
@@ -229,7 +229,7 @@ entry:
 define <8 x i16> @ld1r_8h_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_8h_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %tmp = load i16, ptr %x, align 2
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %lane = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %lane
 ;
@@ -243,7 +243,7 @@ entry:
 define <2 x i32> @ld1r_2s_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_2s_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %tmp = load i32, ptr %x, align 4
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %lane = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %lane
 ;
@@ -257,7 +257,7 @@ entry:
 define <4 x i32> @ld1r_4s_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_4s_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %tmp = load i32, ptr %x, align 4
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %lane = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %lane
 ;
@@ -271,7 +271,7 @@ entry:
 define <2 x i64> @ld1r_2d_int_shuff(ptr nocapture %x) {
 ; CHECK-LABEL: 'ld1r_2d_int_shuff'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %tmp = load i64, ptr %x, align 8
-; CHECK-NEXT:  Cost Model: Found costs of 3 for: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %lane
 ;
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 09fd1de99d6c1..308a3785c9f05 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -7,15 +7,15 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @vector_insert_extract(<vscale x 4 x i32> %v0, <vscale x 16 x i32> %v1, <16 x i32> %v2) {
 ; CHECK-VSCALE-1-LABEL: 'vector_insert_extract'
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-VSCALE-2-LABEL: 'vector_insert_extract'
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -44,9 +44,9 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -56,9 +56,9 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -101,9 +101,9 @@ define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -113,9 +113,9 @@ define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -1364,34 +1364,34 @@ define void @match() #3 {
 ; CHECK-VSCALE-1-LABEL: 'match'
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-VSCALE-2-LABEL: 'match'
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'match'
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;