[llvm] [AArch64] Improve cost model for legal subvec insert/extract (PR #81135)

Graham Hunter via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 9 08:14:46 PST 2024


https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/81135

From d25b58f0a0df5913a9f78d331b724864709a0423 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 7 Feb 2024 17:22:35 +0000
Subject: [PATCH 1/4] [NFC][AArch64] Precommit tests for guarding unrolling
 with scalable subvec ins/ext

---
 .../AArch64/scalable-vec-ins-ext.ll           | 87 +++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
new file mode 100644
index 00000000000000..4e9885750008f8
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=128 %s | FileCheck %s -check-prefix=UNROLL-128
+; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 %s | FileCheck %s -check-prefix=UNROLL-256
+
+;; This test contains IR similar to what would be generated when SVE ACLE
+;; routines are used with fixed-width vector types -- lots of subvector inserts
+;; and extracts that are effectively just bitcasts since the types are the
+;; same at a given SVE bit size. We want to make sure that they are not a
+;; barrier to unrolling simple loops with a fixed trip count which could be
+;; further optimized.
+
+define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c, ptr noalias %d) {
+; UNROLL-128-LABEL: define void @test_ins_ext_cost(
+; UNROLL-128-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL-128-NEXT:  entry:
+; UNROLL-128-NEXT:    br label [[FOR_BODY:%.*]]
+; UNROLL-128:       for.body:
+; UNROLL-128-NEXT:    [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
+; UNROLL-128-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
+; UNROLL-128-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
+; UNROLL-128-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
+; UNROLL-128-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
+; UNROLL-128-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
+; UNROLL-128-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
+; UNROLL-128-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; UNROLL-128-NEXT:    [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
+; UNROLL-128-NEXT:    [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
+; UNROLL-128-NEXT:    [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
+; UNROLL-128-NEXT:    [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
+; UNROLL-128-NEXT:    [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
+; UNROLL-128-NEXT:    [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
+; UNROLL-128-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
+; UNROLL-128-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
+; UNROLL-128-NEXT:    br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; UNROLL-128:       exit:
+; UNROLL-128-NEXT:    ret void
+;
+; UNROLL-256-LABEL: define void @test_ins_ext_cost(
+; UNROLL-256-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL-256-NEXT:  entry:
+; UNROLL-256-NEXT:    br label [[FOR_BODY:%.*]]
+; UNROLL-256:       for.body:
+; UNROLL-256-NEXT:    [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
+; UNROLL-256-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
+; UNROLL-256-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
+; UNROLL-256-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
+; UNROLL-256-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
+; UNROLL-256-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
+; UNROLL-256-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
+; UNROLL-256-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; UNROLL-256-NEXT:    [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
+; UNROLL-256-NEXT:    [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
+; UNROLL-256-NEXT:    [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
+; UNROLL-256-NEXT:    [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
+; UNROLL-256-NEXT:    [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
+; UNROLL-256-NEXT:    [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
+; UNROLL-256-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
+; UNROLL-256-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
+; UNROLL-256-NEXT:    br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; UNROLL-256:       exit:
+; UNROLL-256-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %exit.cond = phi i1 [ true, %entry ], [ false, %for.body ]
+  %iv = phi i64 [ 0, %entry ], [ 1, %for.body ]
+  %gep.a = getelementptr inbounds <8 x float>, ptr %a, i64 %iv
+  %load.a = load <8 x float>, ptr %gep.a, align 16
+  %gep.b = getelementptr inbounds <8 x float>, ptr %b, i64 %iv
+  %load.b = load <8 x float>, ptr %gep.b, align 16
+  %gep.c = getelementptr inbounds <8 x float>, ptr %c, i64 %iv
+  %load.c = load <8 x float>, ptr %gep.c, align 16
+  %cast.scalable.b = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.b, i64 0)
+  %cast.scalable.c = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.c, i64 0)
+  %add = fadd <vscale x 4 x float> %cast.scalable.b, %cast.scalable.c
+  %cast.scalable.a = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.a, i64 0)
+  %mul = fmul <vscale x 4 x float> %cast.scalable.a, %add
+  %cast.fixed.d = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> %mul, i64 0)
+  %gep.d = getelementptr inbounds <8 x float>, ptr %d, i64 0, i64 %iv
+  store <8 x float> %cast.fixed.d, ptr %gep.d, align 16
+  br i1 %exit.cond, label %for.body, label %exit
+
+exit:
+  ret void
+}

From 40471622babb1031de174dfacc110896826109bf Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 9 Feb 2024 12:08:39 +0000
Subject: [PATCH 2/4] Add direct cost model tests, remove negative case from
 unroll test

---
 .../CostModel/AArch64/sve-intrinsics.ll       | 37 +++++++++
 .../AArch64/scalable-vec-ins-ext.ll           | 80 +++++++------------
 2 files changed, 65 insertions(+), 52 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 3b25f4e7ccb945..ff1f52074aa087 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -30,6 +30,41 @@ declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32>
 declare <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32>, i64)
 declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32>, <vscale x 4 x i32>, i64)
 
+define void @vector_insert_extract_legal_idxzero_128b(<vscale x 4 x float> %v0, <4 x float> %v1, <vscale x 2 x double> %v2) #1 {
+; CHECK-LABEL: 'vector_insert_extract_legal_idxzero_128b'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> %v0, <4 x float> %v1, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> %v2, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_legal_idxzero_128b'
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> %v0, <4 x float> %v1, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> %v2, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> %v0, <4 x float> %v1, i64 0)
+  %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> %v2, i64 0)
+  ret void
+}
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double>, i64)
+
+define void @vector_insert_extract_legal_idxzero_256b(<vscale x 8 x i16> %v0, <16 x i16> %v1, <vscale x 4 x float> %v2) #2 {
+; CHECK-LABEL: 'vector_insert_extract_legal_idxzero_256b'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> %v0, <16 x i16> %v1, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> %v2, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_legal_idxzero_256b'
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> %v0, <16 x i16> %v1, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> %v2, i64 0)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> %v0, <16 x i16> %v1, i64 0)
+  %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> %v2, i64 0)
+  ret void
+}
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16>, <16 x i16>, i64)
+declare <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float>, i64)
 
 define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale x 4 x float> %v2, <vscale x 4 x double> %v3) {
 ; CHECK-LABEL: 'reductions'
@@ -864,3 +899,5 @@ declare void @llvm.masked.scatter.v4i32(<4 x i32> %data, <4 x ptr> %ptrs, i32 %a
 declare void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 %align, <1 x i1> %masks)
 
 attributes #0 = { "target-features"="+sve,+bf16" }
+attributes #1 = { "target-features"="+sve" vscale_range(1,16) }
+attributes #2 = { "target-features"="+sve" vscale_range(2, 16) }
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
index 4e9885750008f8..1d5bc84c276357 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=128 %s | FileCheck %s -check-prefix=UNROLL-128
-; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 %s | FileCheck %s -check-prefix=UNROLL-256
+; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 %s | FileCheck %s
 
 ;; This test contains IR similar to what would be generated when SVE ACLE
 ;; routines are used with fixed-width vector types -- lots of subvector inserts
@@ -9,56 +8,31 @@
 ;; barrier to unrolling simple loops with a fixed trip count which could be
 ;; further optimized.
 
-define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c, ptr noalias %d) {
-; UNROLL-128-LABEL: define void @test_ins_ext_cost(
-; UNROLL-128-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
-; UNROLL-128-NEXT:  entry:
-; UNROLL-128-NEXT:    br label [[FOR_BODY:%.*]]
-; UNROLL-128:       for.body:
-; UNROLL-128-NEXT:    [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
-; UNROLL-128-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
-; UNROLL-128-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
-; UNROLL-128-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
-; UNROLL-128-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
-; UNROLL-128-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
-; UNROLL-128-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
-; UNROLL-128-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
-; UNROLL-128-NEXT:    [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
-; UNROLL-128-NEXT:    [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
-; UNROLL-128-NEXT:    [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
-; UNROLL-128-NEXT:    [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
-; UNROLL-128-NEXT:    [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
-; UNROLL-128-NEXT:    [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
-; UNROLL-128-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
-; UNROLL-128-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
-; UNROLL-128-NEXT:    br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
-; UNROLL-128:       exit:
-; UNROLL-128-NEXT:    ret void
-;
-; UNROLL-256-LABEL: define void @test_ins_ext_cost(
-; UNROLL-256-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
-; UNROLL-256-NEXT:  entry:
-; UNROLL-256-NEXT:    br label [[FOR_BODY:%.*]]
-; UNROLL-256:       for.body:
-; UNROLL-256-NEXT:    [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
-; UNROLL-256-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
-; UNROLL-256-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
-; UNROLL-256-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
-; UNROLL-256-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
-; UNROLL-256-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
-; UNROLL-256-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
-; UNROLL-256-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
-; UNROLL-256-NEXT:    [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
-; UNROLL-256-NEXT:    [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
-; UNROLL-256-NEXT:    [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
-; UNROLL-256-NEXT:    [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
-; UNROLL-256-NEXT:    [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
-; UNROLL-256-NEXT:    [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
-; UNROLL-256-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
-; UNROLL-256-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
-; UNROLL-256-NEXT:    br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
-; UNROLL-256:       exit:
-; UNROLL-256-NEXT:    ret void
+define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c, ptr noalias %d) #0 {
+; CHECK-LABEL: define void @test_ins_ext_cost(
+; CHECK-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
+; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; CHECK-NEXT:    [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
+; CHECK-NEXT:    [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
+; CHECK-NEXT:    [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
+; CHECK-NEXT:    [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
+; CHECK-NEXT:    [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
+; CHECK-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
+; CHECK-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %for.body
@@ -85,3 +59,5 @@ for.body:
 exit:
   ret void
 }
+
+attributes #0 = { "target-features"="+sve" vscale_range(2, 16) }

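(For reference only, not part of the patch series: the new costs can be checked
on a small standalone input with the cost-model printer. The invocation and IR
below are an assumption for illustration; the committed sve-intrinsics.ll test
carries its own RUN lines, which may use different flags.)

; Assumed invocation: opt -passes="print<cost-model>" -disable-output -mtriple=aarch64 -mattr=+sve example.ll
; Both calls below use legal types and index 0, so once the cost-model changes
; in the following commits are applied, the printer should report an estimated
; cost of 1 for each of them.
define void @query_costs(<vscale x 4 x float> %acc, <4 x float> %subvec) #0 {
  %ins = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> %acc, <4 x float> %subvec, i64 0)
  %ext = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> %acc, i64 0)
  ret void
}
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
declare <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float>, i64)
attributes #0 = { "target-features"="+sve" vscale_range(1,16) }
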
From f8b12f702cc69f78f1b1c72fcd4f7414fc294d72 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 7 Feb 2024 15:27:41 +0000
Subject: [PATCH 3/4] [AArch64] Don't model legal subvector insert/extract as
 scalarization

---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cdd2750521d2c9..0add50e5067886 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -568,6 +568,32 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     }
     return Cost;
   }
+  case Intrinsic::vector_extract: {
+    // If both the vector argument and the return type are legal types, then
+    // this should be a no-op or simple operation; return a relatively low cost.
+    LLVMContext &C = RetTy->getContext();
+    EVT MRTy = getTLI()->getValueType(DL, RetTy);
+    EVT MPTy = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    TargetLoweringBase::LegalizeKind RLK = getTLI()->getTypeConversion(C, MRTy);
+    TargetLoweringBase::LegalizeKind PLK = getTLI()->getTypeConversion(C, MPTy);
+    if (RLK.first == TargetLoweringBase::TypeLegal &&
+        PLK.first == TargetLoweringBase::TypeLegal)
+      return InstructionCost(1);
+    break;
+  }
+  case Intrinsic::vector_insert: {
+    // If both the vector and subvector arguments are legal types, then this
+    // should be a no-op or simple operation; return a relatively low cost.
+    LLVMContext &C = RetTy->getContext();
+    EVT MTy0 = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    EVT MTy1 = getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
+    TargetLoweringBase::LegalizeKind LK0 = getTLI()->getTypeConversion(C, MTy0);
+    TargetLoweringBase::LegalizeKind LK1 = getTLI()->getTypeConversion(C, MTy1);
+    if (LK0.first == TargetLoweringBase::TypeLegal &&
+        LK1.first == TargetLoweringBase::TypeLegal)
+      return InstructionCost(1);
+    break;
+  }
   case Intrinsic::bitreverse: {
     static const CostTblEntry BitreverseTbl[] = {
         {Intrinsic::bitreverse, MVT::i32, 1},

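(Illustrative only, not taken from the patch's tests: the follow-up commit below
restricts the low cost to inserts/extracts at index 0. In a fragment like the
one here, only the first extract keeps the cost of 1; the second uses a non-zero
offset and falls back to the default, more conservative cost computation.)

define void @index_matters(<vscale x 4 x float> %v) #0 {
  %lo = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> %v, i64 0)
  %hi = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> %v, i64 4)
  ret void
}
declare <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float>, i64)
attributes #0 = { "target-features"="+sve" vscale_range(2,16) }
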
From 521921e01fbe732203427b238e4f70668c823be2 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 9 Feb 2024 16:11:12 +0000
Subject: [PATCH 4/4] Check that the insert/extract index is 0 before providing
 a low cost

---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 28 ++++++++++++----
 .../CostModel/AArch64/sve-intrinsics.ll       |  8 ++---
 .../AArch64/scalable-vec-ins-ext.ll           | 32 +++++++++++--------
 3 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0add50e5067886..c873df7c98fa97 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -569,28 +569,44 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     return Cost;
   }
   case Intrinsic::vector_extract: {
-    // If both the vector argument and the return type are legal types, then
-    // this should be a no-op or simple operation; return a relatively low cost.
+    // If both the vector argument and the return type are legal types and the
+    // index is 0, then this should be a no-op or simple operation; return a
+    // relatively low cost.
+
+    // If arguments aren't actually supplied, then we cannot determine the
+    // value of the index.
+    if (ICA.getArgs().size() < 2)
+      break;
     LLVMContext &C = RetTy->getContext();
     EVT MRTy = getTLI()->getValueType(DL, RetTy);
     EVT MPTy = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
     TargetLoweringBase::LegalizeKind RLK = getTLI()->getTypeConversion(C, MRTy);
     TargetLoweringBase::LegalizeKind PLK = getTLI()->getTypeConversion(C, MPTy);
+    const ConstantInt *Idx = dyn_cast<ConstantInt>(ICA.getArgs()[1]);
     if (RLK.first == TargetLoweringBase::TypeLegal &&
-        PLK.first == TargetLoweringBase::TypeLegal)
+        PLK.first == TargetLoweringBase::TypeLegal && Idx &&
+        Idx->getZExtValue() == 0)
       return InstructionCost(1);
     break;
   }
   case Intrinsic::vector_insert: {
-    // If both the vector and subvector arguments are legal types, then this
-    // should be a no-op or simple operation; return a relatively low cost.
+    // If both the vector and subvector arguments are legal types and the index
+    // is 0, then this should be a no-op or simple operation; return a
+    // relatively low cost.
+
+    // If arguments aren't actually supplied, then we cannot determine the
+    // value of the index.
+    if (ICA.getArgs().size() < 3)
+      break;
     LLVMContext &C = RetTy->getContext();
     EVT MTy0 = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
     EVT MTy1 = getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
     TargetLoweringBase::LegalizeKind LK0 = getTLI()->getTypeConversion(C, MTy0);
     TargetLoweringBase::LegalizeKind LK1 = getTLI()->getTypeConversion(C, MTy1);
+    const ConstantInt *Idx = dyn_cast<ConstantInt>(ICA.getArgs()[2]);
     if (LK0.first == TargetLoweringBase::TypeLegal &&
-        LK1.first == TargetLoweringBase::TypeLegal)
+        LK1.first == TargetLoweringBase::TypeLegal && Idx &&
+        Idx->getZExtValue() == 0)
       return InstructionCost(1);
     break;
   }
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index ff1f52074aa087..6d712e8aa62208 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -32,8 +32,8 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x
 
 define void @vector_insert_extract_legal_idxzero_128b(<vscale x 4 x float> %v0, <4 x float> %v1, <vscale x 2 x double> %v2) #1 {
 ; CHECK-LABEL: 'vector_insert_extract_legal_idxzero_128b'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> %v0, <4 x float> %v1, i64 0)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> %v2, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> %v0, <4 x float> %v1, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> %v2, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_legal_idxzero_128b'
@@ -50,8 +50,8 @@ declare <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double>, i
 
 define void @vector_insert_extract_legal_idxzero_256b(<vscale x 8 x i16> %v0, <16 x i16> %v1, <vscale x 4 x float> %v2) #2 {
 ; CHECK-LABEL: 'vector_insert_extract_legal_idxzero_256b'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> %v0, <16 x i16> %v1, i64 0)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> %v2, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> %v0, <16 x i16> %v1, i64 0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> %v2, i64 0)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_legal_idxzero_256b'
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
index 1d5bc84c276357..cf670f5f4f2753 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
@@ -12,26 +12,30 @@ define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c
 ; CHECK-LABEL: define void @test_ins_ext_cost(
 ; CHECK-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
-; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
-; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; CHECK-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[A]], align 16
+; CHECK-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[B]], align 16
+; CHECK-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[C]], align 16
 ; CHECK-NEXT:    [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
 ; CHECK-NEXT:    [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
 ; CHECK-NEXT:    [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
 ; CHECK-NEXT:    [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
-; CHECK-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
-; CHECK-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
-; CHECK-NEXT:    br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
-; CHECK:       exit:
+; CHECK-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[D]], align 16
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 1
+; CHECK-NEXT:    [[LOAD_A_1:%.*]] = load <8 x float>, ptr [[GEP_A_1]], align 16
+; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 1
+; CHECK-NEXT:    [[LOAD_B_1:%.*]] = load <8 x float>, ptr [[GEP_B_1]], align 16
+; CHECK-NEXT:    [[GEP_C_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 1
+; CHECK-NEXT:    [[LOAD_C_1:%.*]] = load <8 x float>, ptr [[GEP_C_1]], align 16
+; CHECK-NEXT:    [[CAST_SCALABLE_B_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B_1]], i64 0)
+; CHECK-NEXT:    [[CAST_SCALABLE_C_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C_1]], i64 0)
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B_1]], [[CAST_SCALABLE_C_1]]
+; CHECK-NEXT:    [[CAST_SCALABLE_A_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A_1]], i64 0)
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A_1]], [[ADD_1]]
+; CHECK-NEXT:    [[CAST_FIXED_D_1:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL_1]], i64 0)
+; CHECK-NEXT:    [[GEP_D_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 1
+; CHECK-NEXT:    store <8 x float> [[CAST_FIXED_D_1]], ptr [[GEP_D_1]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:


