[llvm] [AArch64] Improve cost model for legal subvec insert/extract (PR #81135)

Mon Mar 4 07:22:31 PST 2024

https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/81135

>From a3030ee7a233d9195cabd6d4ad6aec36e2c9c0a2 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 7 Feb 2024 15:27:41 +0000
Subject: [PATCH 1/6] [AArch64] Don't model legal subvector insert/extract as
 scalarization

---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 42 +++++++++++++++++++
 .../CostModel/AArch64/sve-intrinsics.ll       |  8 ++--
 .../AArch64/scalable-vec-ins-ext.ll           | 32 +++++++-------
 3 files changed, 64 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 010e569809e276..5a2a3e8b8cd256 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -568,6 +568,48 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     }
     return Cost;
   }
+  case Intrinsic::vector_extract: {
+    // If both the vector argument and the return type are legal types and the
+    // index is 0, then this should be a no-op or simple operation; return a
+    // relatively low cost.
+
+    // If arguments aren't actually supplied, then we cannot determine the
+    // value of the index.
+    if (ICA.getArgs().size() < 2)
+      break;
+    LLVMContext &C = RetTy->getContext();
+    EVT MRTy = getTLI()->getValueType(DL, RetTy);
+    EVT MPTy = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    TargetLoweringBase::LegalizeKind RLK = getTLI()->getTypeConversion(C, MRTy);
+    TargetLoweringBase::LegalizeKind PLK = getTLI()->getTypeConversion(C, MPTy);
+    const ConstantInt *Idx = dyn_cast<ConstantInt>(ICA.getArgs()[1]);
+    if (RLK.first == TargetLoweringBase::TypeLegal &&
+        PLK.first == TargetLoweringBase::TypeLegal && Idx &&
+        Idx->getZExtValue() == 0)
+      return InstructionCost(1);
+    break;
+  }
+  case Intrinsic::vector_insert: {
+    // If both the vector and subvector arguments are legal types and the index
+    // is 0, then this should be a no-op or simple operation; return a
+    // relatively low cost.
+
+    // If arguments aren't actually supplied, then we cannot determine the
+    // value of the index.
+    if (ICA.getArgs().size() < 3)
+      break;
+    LLVMContext &C = RetTy->getContext();
+    EVT MTy0 = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    EVT MTy1 = getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
+    TargetLoweringBase::LegalizeKind LK0 = getTLI()->getTypeConversion(C, MTy0);
+    TargetLoweringBase::LegalizeKind LK1 = getTLI()->getTypeConversion(C, MTy1);
+    const ConstantInt *Idx = dyn_cast<ConstantInt>(ICA.getArgs()[2]);
+    if (LK0.first == TargetLoweringBase::TypeLegal &&
+        LK1.first == TargetLoweringBase::TypeLegal && Idx &&
+        Idx->getZExtValue() == 0)
+      return InstructionCost(1);
+    break;
+  }
   case Intrinsic::bitreverse: {
     static const CostTblEntry BitreverseTbl[] = {
         {Intrinsic::bitreverse, MVT::i32, 1},
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 025382878d9931..9eac3dae26bab2 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -32,8 +32,8 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x
 
 define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-LABEL: 'vector_insert_extract_idxzero_128b'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
@@ -61,8 +61,8 @@ declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>
 
 define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-LABEL: 'vector_insert_extract_idxzero_256b'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
index 1d5bc84c276357..cf670f5f4f2753 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
@@ -12,26 +12,30 @@ define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c
 ; CHECK-LABEL: define void @test_ins_ext_cost(
 ; CHECK-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
-; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
-; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; CHECK-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[A]], align 16
+; CHECK-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[B]], align 16
+; CHECK-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[C]], align 16
 ; CHECK-NEXT:    [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
 ; CHECK-NEXT:    [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
 ; CHECK-NEXT:    [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
 ; CHECK-NEXT:    [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
-; CHECK-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
-; CHECK-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
-; CHECK-NEXT:    br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
-; CHECK:       exit:
+; CHECK-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[D]], align 16
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 1
+; CHECK-NEXT:    [[LOAD_A_1:%.*]] = load <8 x float>, ptr [[GEP_A_1]], align 16
+; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 1
+; CHECK-NEXT:    [[LOAD_B_1:%.*]] = load <8 x float>, ptr [[GEP_B_1]], align 16
+; CHECK-NEXT:    [[GEP_C_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 1
+; CHECK-NEXT:    [[LOAD_C_1:%.*]] = load <8 x float>, ptr [[GEP_C_1]], align 16
+; CHECK-NEXT:    [[CAST_SCALABLE_B_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B_1]], i64 0)
+; CHECK-NEXT:    [[CAST_SCALABLE_C_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C_1]], i64 0)
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B_1]], [[CAST_SCALABLE_C_1]]
+; CHECK-NEXT:    [[CAST_SCALABLE_A_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A_1]], i64 0)
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A_1]], [[ADD_1]]
+; CHECK-NEXT:    [[CAST_FIXED_D_1:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL_1]], i64 0)
+; CHECK-NEXT:    [[GEP_D_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 1
+; CHECK-NEXT:    store <8 x float> [[CAST_FIXED_D_1]], ptr [[GEP_D_1]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:

>From b5b9931747978eab32e34930fd80c575eacb9618 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Mon, 19 Feb 2024 14:45:47 +0000
Subject: [PATCH 2/6] Skip predicate types, improve arg count checks

---
 .../Target/AArch64/AArch64TargetTransformInfo.cpp  | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5a2a3e8b8cd256..eadd325877a411 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -574,8 +574,9 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     // relatively low cost.
 
     // If arguments aren't actually supplied, then we cannot determine the
-    // value of the index.
-    if (ICA.getArgs().size() < 2)
+    // value of the index. We also want to skip this on predicate types.
+    if (ICA.getArgs().size() != 2 ||
+        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
       break;
     LLVMContext &C = RetTy->getContext();
     EVT MRTy = getTLI()->getValueType(DL, RetTy);
@@ -586,7 +587,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     if (RLK.first == TargetLoweringBase::TypeLegal &&
         PLK.first == TargetLoweringBase::TypeLegal && Idx &&
         Idx->getZExtValue() == 0)
-      return InstructionCost(1);
+      return TTI::TCC_Basic;
     break;
   }
   case Intrinsic::vector_insert: {
@@ -595,8 +596,9 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     // relatively low cost.
 
     // If arguments aren't actually supplied, then we cannot determine the
-    // value of the index.
-    if (ICA.getArgs().size() < 3)
+    // value of the index. We also want to skip this on predicate types.
+    if (ICA.getArgs().size() != 3 ||
+        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
       break;
     LLVMContext &C = RetTy->getContext();
     EVT MTy0 = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
@@ -607,7 +609,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     if (LK0.first == TargetLoweringBase::TypeLegal &&
         LK1.first == TargetLoweringBase::TypeLegal && Idx &&
         Idx->getZExtValue() == 0)
-      return InstructionCost(1);
+      return TTI::TCC_Basic;
     break;
   }
   case Intrinsic::bitreverse: {

>From 37b3bbcdecc4a91173d5ad80cae76fcdf4a7373a Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 29 Feb 2024 09:23:04 +0000
Subject: [PATCH 3/6] Skip unpacked SVE types

---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 18 +++++++++++++--
 .../CostModel/AArch64/sve-intrinsics.ll       | 22 +++++++++++++++----
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index eadd325877a411..60215848a848d0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -581,13 +581,20 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     LLVMContext &C = RetTy->getContext();
     EVT MRTy = getTLI()->getValueType(DL, RetTy);
     EVT MPTy = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    // Skip this if either the return type or the vector argument are unpacked
+    // SVE types; they may get lowered to stack stores and loads.
+    if ((MRTy.isScalableVector() &&
+         MRTy.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock) ||
+        (MPTy.isScalableVector() &&
+         MPTy.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock))
+      break;
     TargetLoweringBase::LegalizeKind RLK = getTLI()->getTypeConversion(C, MRTy);
     TargetLoweringBase::LegalizeKind PLK = getTLI()->getTypeConversion(C, MPTy);
     const ConstantInt *Idx = dyn_cast<ConstantInt>(ICA.getArgs()[1]);
     if (RLK.first == TargetLoweringBase::TypeLegal &&
         PLK.first == TargetLoweringBase::TypeLegal && Idx &&
         Idx->getZExtValue() == 0)
-      return TTI::TCC_Basic;
+      return TTI::TCC_Free;
     break;
   }
   case Intrinsic::vector_insert: {
@@ -603,13 +610,20 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     LLVMContext &C = RetTy->getContext();
     EVT MTy0 = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
     EVT MTy1 = getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
+    // Skip this if either type is an unpacked SVE type; they may get lowered
+    // to stack stores and loads.
+    if ((MTy0.isScalableVector() &&
+         MTy0.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock) ||
+        (MTy1.isScalableVector() &&
+         MTy1.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock))
+      break;
     TargetLoweringBase::LegalizeKind LK0 = getTLI()->getTypeConversion(C, MTy0);
     TargetLoweringBase::LegalizeKind LK1 = getTLI()->getTypeConversion(C, MTy1);
     const ConstantInt *Idx = dyn_cast<ConstantInt>(ICA.getArgs()[2]);
     if (LK0.first == TargetLoweringBase::TypeLegal &&
         LK1.first == TargetLoweringBase::TypeLegal && Idx &&
         Idx->getZExtValue() == 0)
-      return TTI::TCC_Basic;
+      return TTI::TCC_Free;
     break;
   }
   case Intrinsic::bitreverse: {
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 9eac3dae26bab2..1f79a4a46f1631 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -32,11 +32,13 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x
 
 define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-LABEL: 'vector_insert_extract_idxzero_128b'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_128b'
@@ -45,6 +47,8 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
@@ -52,20 +56,26 @@ define void @vector_insert_extract_idxzero_128b() #1 {
   %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
   %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
   %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+  %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
+  %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
   ret void
 }
 declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
 declare <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double>, i64)
 declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1>, <vscale x 2 x i1>, i64)
 declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64)
+declare <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float>, <2 x float>, i64)
+declare <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half>, i64)
 
 define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-LABEL: 'vector_insert_extract_idxzero_256b'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_256b'
@@ -74,6 +84,8 @@ define void @vector_insert_extract_idxzero_256b() #2 {
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
@@ -81,6 +93,8 @@ define void @vector_insert_extract_idxzero_256b() #2 {
   %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
   %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
   %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+  %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
+  %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
   ret void
 }
 declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16>, <16 x i16>, i64)

>From 53cac3688585855551e4b0fd127e99f9d1174ec0 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Mon, 4 Mar 2024 09:51:12 +0000
Subject: [PATCH 4/6] Improve naming, add extra test

---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 35 ++++++++++---------
 .../CostModel/AArch64/sve-intrinsics.ll       | 14 ++++++++
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 60215848a848d0..d23d50ec97d1e9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -500,6 +500,11 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
   return TTI::PSK_Software;
 }
 
+static bool isUnpackedVectorVT(EVT VecVT) {
+  return VecVT.isScalableVector() &&
+         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
+}
+
 InstructionCost
 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                       TTI::TargetCostKind CostKind) {
@@ -579,17 +584,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
         ICA.getReturnType()->getScalarType()->isIntegerTy(1))
       break;
     LLVMContext &C = RetTy->getContext();
-    EVT MRTy = getTLI()->getValueType(DL, RetTy);
-    EVT MPTy = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    EVT RetVT = getTLI()->getValueType(DL, RetTy);
+    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
     // Skip this if either the return type or the vector argument are unpacked
     // SVE types; they may get lowered to stack stores and loads.
-    if ((MRTy.isScalableVector() &&
-         MRTy.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock) ||
-        (MPTy.isScalableVector() &&
-         MPTy.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock))
+    if (isUnpackedVectorVT(RetVT) || isUnpackedVectorVT(VecVT))
       break;
-    TargetLoweringBase::LegalizeKind RLK = getTLI()->getTypeConversion(C, MRTy);
-    TargetLoweringBase::LegalizeKind PLK = getTLI()->getTypeConversion(C, MPTy);
+    TargetLoweringBase::LegalizeKind RLK =
+        getTLI()->getTypeConversion(C, RetVT);
+    TargetLoweringBase::LegalizeKind PLK =
+        getTLI()->getTypeConversion(C, VecVT);
     const ConstantInt *Idx = dyn_cast<ConstantInt>(ICA.getArgs()[1]);
     if (RLK.first == TargetLoweringBase::TypeLegal &&
         PLK.first == TargetLoweringBase::TypeLegal && Idx &&
@@ -608,17 +612,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
         ICA.getReturnType()->getScalarType()->isIntegerTy(1))
       break;
     LLVMContext &C = RetTy->getContext();
-    EVT MTy0 = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
-    EVT MTy1 = getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
+    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    EVT SubVecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
     // Skip this if either type is an unpacked SVE type; they may get lowered
     // to stack stores and loads.
-    if ((MTy0.isScalableVector() &&
-         MTy0.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock) ||
-        (MTy1.isScalableVector() &&
-         MTy1.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock))
+    if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
       break;
-    TargetLoweringBase::LegalizeKind LK0 = getTLI()->getTypeConversion(C, MTy0);
-    TargetLoweringBase::LegalizeKind LK1 = getTLI()->getTypeConversion(C, MTy1);
+    TargetLoweringBase::LegalizeKind LK0 =
+        getTLI()->getTypeConversion(C, VecVT);
+    TargetLoweringBase::LegalizeKind LK1 =
+        getTLI()->getTypeConversion(C, SubVecVT);
     const ConstantInt *Idx = dyn_cast<ConstantInt>(ICA.getArgs()[2]);
     if (LK0.first == TargetLoweringBase::TypeLegal &&
         LK1.first == TargetLoweringBase::TypeLegal && Idx &&
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 1f79a4a46f1631..937c383aedc781 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -39,6 +39,8 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_128b'
@@ -49,6 +51,8 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
@@ -58,6 +62,8 @@ define void @vector_insert_extract_idxzero_128b() #1 {
   %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
   %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
   %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
+  %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
+  %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
   ret void
 }
 declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
@@ -66,6 +72,8 @@ declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1>,
 declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64)
 declare <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float>, <2 x float>, i64)
 declare <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float>, <vscale x 2 x float>, i64)
+declare <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float>, i64)
 
 define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-LABEL: 'vector_insert_extract_idxzero_256b'
@@ -76,6 +84,8 @@ define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_256b'
@@ -86,6 +96,8 @@ define void @vector_insert_extract_idxzero_256b() #2 {
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
@@ -95,6 +107,8 @@ define void @vector_insert_extract_idxzero_256b() #2 {
   %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
   %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
   %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
+  %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
+  %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
   ret void
 }
 declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16>, <16 x i16>, i64)

>From 69f866fae5a180bc1d81455589fa9dc1d6ca579f Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Mon, 4 Mar 2024 12:43:50 +0000
Subject: [PATCH 5/6] Unify insert/extract, more renaming

---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 59 ++++++-------------
 1 file changed, 18 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d23d50ec97d1e9..cee157b2dff18b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -573,59 +573,36 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     }
     return Cost;
   }
-  case Intrinsic::vector_extract: {
-    // If both the vector argument and the return type are legal types and the
-    // index is 0, then this should be a no-op or simple operation; return a
-    // relatively low cost.
-
-    // If arguments aren't actually supplied, then we cannot determine the
-    // value of the index. We also want to skip this on predicate types.
-    if (ICA.getArgs().size() != 2 ||
-        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
-      break;
-    LLVMContext &C = RetTy->getContext();
-    EVT RetVT = getTLI()->getValueType(DL, RetTy);
-    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
-    // Skip this if either the return type or the vector argument are unpacked
-    // SVE types; they may get lowered to stack stores and loads.
-    if (isUnpackedVectorVT(RetVT) || isUnpackedVectorVT(VecVT))
-      break;
-    TargetLoweringBase::LegalizeKind RLK =
-        getTLI()->getTypeConversion(C, RetVT);
-    TargetLoweringBase::LegalizeKind PLK =
-        getTLI()->getTypeConversion(C, VecVT);
-    const ConstantInt *Idx = dyn_cast<ConstantInt>(ICA.getArgs()[1]);
-    if (RLK.first == TargetLoweringBase::TypeLegal &&
-        PLK.first == TargetLoweringBase::TypeLegal && Idx &&
-        Idx->getZExtValue() == 0)
-      return TTI::TCC_Free;
-    break;
-  }
+  case Intrinsic::vector_extract:
   case Intrinsic::vector_insert: {
-    // If both the vector and subvector arguments are legal types and the index
+    // If both the vector and subvector types are legal types and the index
     // is 0, then this should be a no-op or simple operation; return a
     // relatively low cost.
 
     // If arguments aren't actually supplied, then we cannot determine the
-    // value of the index. We also want to skip this on predicate types.
-    if (ICA.getArgs().size() != 3 ||
+    // value of the index. We also want to skip predicate types.
+    if (ICA.isTypeBasedOnly() ||
         ICA.getReturnType()->getScalarType()->isIntegerTy(1))
       break;
+
     LLVMContext &C = RetTy->getContext();
     EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
-    EVT SubVecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
-    // Skip this if either type is an unpacked SVE type; they may get lowered
-    // to stack stores and loads.
+    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
+    EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
+                             : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
+    // Skip this if either the vector or subvector types are unpacked
+    // SVE types; they may get lowered to stack stores and loads.
     if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
       break;
-    TargetLoweringBase::LegalizeKind LK0 =
-        getTLI()->getTypeConversion(C, VecVT);
-    TargetLoweringBase::LegalizeKind LK1 =
+
+    TargetLoweringBase::LegalizeKind SubVecLK =
         getTLI()->getTypeConversion(C, SubVecVT);
-    const ConstantInt *Idx = dyn_cast<ConstantInt>(ICA.getArgs()[2]);
-    if (LK0.first == TargetLoweringBase::TypeLegal &&
-        LK1.first == TargetLoweringBase::TypeLegal && Idx &&
-        Idx->getZExtValue() == 0)
+    TargetLoweringBase::LegalizeKind VecLK =
+        getTLI()->getTypeConversion(C, VecVT);
+    const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
+    const ConstantInt *CIdx = dyn_cast<ConstantInt>(Idx);
+    if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
+        VecLK.first == TargetLoweringBase::TypeLegal && CIdx && CIdx->isZero())
       return TTI::TCC_Free;
     break;
   }

>From 595faf89c6de91d42c5b00e957776d53de07ab93 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Mon, 4 Mar 2024 15:20:14 +0000
Subject: [PATCH 6/6] Verify number of args matches number of argtypes

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cee157b2dff18b..48f053c39b1bdc 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -581,7 +581,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
 
     // If arguments aren't actually supplied, then we cannot determine the
     // value of the index. We also want to skip predicate types.
-    if (ICA.isTypeBasedOnly() ||
+    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
         ICA.getReturnType()->getScalarType()->isIntegerTy(1))
       break;