[llvm] [LLVM][SVE] Honour calling convention when using SVE for fixed length vectors. (PR #70847)

Paul Walker via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 28 04:59:57 PST 2023


https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/70847

From b8e060191e371aa7fbaf561fbfb7fb3a37174488 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Thu, 23 Nov 2023 14:44:58 +0000
Subject: [PATCH 1/2] [LLVM][SVE] Add tests to prove SVE does not affect the
 calling convention.

NOTE: When SVE is enabled, some of these tests trigger asserts.
---
 .../sve-fixed-length-function-calls.ll        | 243 ++++++++++++++++++
 .../AArch64/sve-fixed-length-functions.ll     | 211 +++++++++++++++
 2 files changed, 454 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
new file mode 100644
index 000000000000000..c34fcc59b3187b2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
@@ -0,0 +1,243 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+declare void @foo_v32i8(<32 x i8>)
+define void @test_v32i8(<32 x i8> %unused, <32 x i8> %a) #0 {
+; CHECK-LABEL: test_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    b foo_v32i8
+  tail call void @foo_v32i8(<32 x i8> %a)
+  ret void
+}
+
+declare void @foo_v16i16(<16 x i16>)
+define void @test_v16i16(<16 x i16> %unused, <16 x i16> %a) #0 {
+; CHECK-LABEL: test_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    b foo_v16i16
+  tail call void @foo_v16i16(<16 x i16> %a)
+  ret void
+}
+
+declare void @foo_v8i32(<8 x i32>)
+define void @test_v8i32(<8 x i32> %unused, <8 x i32> %a) #0 {
+; CHECK-LABEL: test_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    b foo_v8i32
+  tail call void @foo_v8i32(<8 x i32> %a)
+  ret void
+}
+
+declare void @foo_v4i64(<4 x i64>)
+define void @test_v4i64(<4 x i64> %unused, <4 x i64> %a) #0 {
+; CHECK-LABEL: test_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    b foo_v4i64
+  tail call void @foo_v4i64(<4 x i64> %a)
+  ret void
+}
+
+declare void @foo_v16f16(<16 x half>)
+define void @test_v16f16(<16 x half> %unused, <16 x half> %a) #0 {
+; CHECK-LABEL: test_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    b foo_v16f16
+  tail call void @foo_v16f16(<16 x half> %a)
+  ret void
+}
+
+declare void @foo_v8f32(<8 x float>)
+define void @test_v8f32(<8 x float> %unused, <8 x float> %a) #0 {
+; CHECK-LABEL: test_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    b foo_v8f32
+  tail call void @foo_v8f32(<8 x float> %a)
+  ret void
+}
+
+declare void @foo_v4f64(<4 x double>)
+define void @test_v4f64(<4 x double> %unused, <4 x double> %a) #0 {
+; CHECK-LABEL: test_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    b foo_v4f64
+  tail call void @foo_v4f64(<4 x double> %a)
+  ret void
+}
+
+declare void @foo_v16bf16(<16 x bfloat>)
+define void @test_v16bf16(<16 x bfloat> %unused, <16 x bfloat> %a) #0 {
+; CHECK-LABEL: test_v16bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    b foo_v16bf16
+  tail call void @foo_v16bf16(<16 x bfloat> %a)
+  ret void
+}
+
+declare void @foo_v3i64(<3 x i64>)
+define void @test_v3i64(<3 x i64> %unused, <3 x i64> %a) #0 {
+; CHECK-LABEL: test_v3i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d2, d5
+; CHECK-NEXT:    fmov d1, d4
+; CHECK-NEXT:    fmov d0, d3
+; CHECK-NEXT:    b foo_v3i64
+  tail call void @foo_v3i64(<3 x i64> %a)
+  ret void
+}
+
+declare void @foo_v5i64(<5 x i64>)
+define void @test_v5i64(<5 x i64> %unused, <5 x i64> %a) #0 {
+; CHECK-LABEL: test_v5i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d1, d6
+; CHECK-NEXT:    fmov d0, d5
+; CHECK-NEXT:    fmov d2, d7
+; CHECK-NEXT:    ldp d3, d4, [sp]
+; CHECK-NEXT:    b foo_v5i64
+  tail call void @foo_v5i64(<5 x i64> %a)
+  ret void
+}
+
+declare void @foo_v1i16(<1 x i16>)
+define void @test_v1i16(<1 x i16> %unused, <1 x i16> %a) #0 {
+; CHECK-LABEL: test_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    b foo_v1i16
+  tail call void @foo_v1i16(<1 x i16> %a)
+  ret void
+}
+
+declare void @foo_v9i16(<9 x i16>)
+define void @test_v9i16(<9 x i16> %unused, <9 x i16> %a) #0 {
+; CHECK-LABEL: test_v9i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w0, [sp, #8]
+; CHECK-NEXT:    ldr w1, [sp, #16]
+; CHECK-NEXT:    ldr w2, [sp, #24]
+; CHECK-NEXT:    ldr w3, [sp, #32]
+; CHECK-NEXT:    ldr w4, [sp, #40]
+; CHECK-NEXT:    ldr w5, [sp, #48]
+; CHECK-NEXT:    ldr w6, [sp, #56]
+; CHECK-NEXT:    ldr w7, [sp, #64]
+; CHECK-NEXT:    ldr w8, [sp, #72]
+; CHECK-NEXT:    str w8, [sp]
+; CHECK-NEXT:    b foo_v9i16
+  tail call void @foo_v9i16(<9 x i16> %a)
+  ret void
+}
+
+declare void @foo_v16i1(<16 x i1>)
+define void @test_v16i1(<16 x i1> %unused, <16 x i1> %a) #0 {
+; CHECK-LABEL: test_v16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    b foo_v16i1
+  tail call void @foo_v16i1(<16 x i1> %a)
+  ret void
+}
+
+; UTC_ARGS: --disable
+; The output from this test is large and generally not useful; what matters is
+; that no vector registers are used.
+declare void @foo_v32i1(<32 x i1>)
+define void @test_v32i1(<32 x i1> %unused, <32 x i1> %a) #0 {
+; CHECK-LABEL: test_v32i1:
+; CHECK:       // %bb.0:
+; CHECK-NOT:     {{[qvz][0-9]+}}
+; CHECK:         b foo_v32i1
+  tail call void @foo_v32i1(<32 x i1> %a)
+  ret void
+}
+; UTC_ARGS: --enable
+
+declare void @foo_v1i128(<1 x i128>)
+define void @test_v1i128(<1 x i128> %unused, <1 x i128> %a) #0 {
+; CHECK-LABEL: test_v1i128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x1, x3
+; CHECK-NEXT:    mov x0, x2
+; CHECK-NEXT:    b foo_v1i128
+  tail call void @foo_v1i128(<1 x i128> %a)
+  ret void
+}
+
+declare void @foo_v2i128(<2 x i128>)
+define void @test_v2i128(<2 x i128> %unused, <2 x i128> %a) #0 {
+; CHECK-LABEL: test_v2i128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x3, x7
+; CHECK-NEXT:    mov x2, x6
+; CHECK-NEXT:    mov x0, x4
+; CHECK-NEXT:    mov x1, x5
+; CHECK-NEXT:    b foo_v2i128
+  tail call void @foo_v2i128(<2 x i128> %a)
+  ret void
+}
+
+declare void @foo_v1i256(<1 x i256>)
+define void @test_v1i256(<1 x i256> %unused, <1 x i256> %a) #0 {
+; CHECK-LABEL: test_v1i256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x3, x7
+; CHECK-NEXT:    mov x2, x6
+; CHECK-NEXT:    mov x0, x4
+; CHECK-NEXT:    mov x1, x5
+; CHECK-NEXT:    b foo_v1i256
+  tail call void @foo_v1i256(<1 x i256> %a)
+  ret void
+}
+
+declare void @foo_v2i256(<2 x i256>)
+define void @test_v2i256(<2 x i256> %unused, <2 x i256> %a) #0 {
+; CHECK-LABEL: test_v2i256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp x0, x1, [sp]
+; CHECK-NEXT:    ldp x2, x3, [sp, #16]
+; CHECK-NEXT:    ldp x4, x5, [sp, #32]
+; CHECK-NEXT:    ldp x6, x7, [sp, #48]
+; CHECK-NEXT:    b foo_v2i256
+  tail call void @foo_v2i256(<2 x i256> %a)
+  ret void
+}
+
+declare void @foo_v1f128(<1 x fp128>)
+define void @test_v1f128(<1 x fp128> %unused, <1 x fp128> %a) #0 {
+; CHECK-LABEL: test_v1f128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    b foo_v1f128
+  tail call void @foo_v1f128(<1 x fp128> %a)
+  ret void
+}
+
+declare void @foo_v2f128(<2 x fp128>)
+define void @test_v2f128(<2 x fp128> %unused, <2 x fp128> %a) #0 {
+; CHECK-LABEL: test_v2f128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    b foo_v2f128
+  tail call void @foo_v2f128(<2 x fp128> %a)
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve,+bf16" nounwind }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
new file mode 100644
index 000000000000000..98c17d78189c76a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
@@ -0,0 +1,211 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <32 x i8> @test_v32i8(<32 x i8> %unused, <32 x i8> %a) #0 {
+; CHECK-LABEL: test_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  ret <32 x i8> %a
+}
+
+define <16 x i16> @test_v16i16(<16 x i16> %unused, <16 x i16> %a) #0 {
+; CHECK-LABEL: test_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  ret <16 x i16> %a
+}
+
+define <8 x i32> @test_v8i32(<8 x i32> %unused, <8 x i32> %a) #0 {
+; CHECK-LABEL: test_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  ret <8 x i32> %a
+}
+
+define <4 x i64> @test_v4i64(<4 x i64> %unused, <4 x i64> %a) #0 {
+; CHECK-LABEL: test_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  ret <4 x i64> %a
+}
+
+define <16 x half> @test_v16f16(<16 x half> %unused, <16 x half> %a) #0 {
+; CHECK-LABEL: test_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  ret <16 x half> %a
+}
+
+define <8 x float> @test_v8f32(<8 x float> %unused, <8 x float> %a) #0 {
+; CHECK-LABEL: test_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  ret <8 x float> %a
+}
+
+define <4 x double> @test_v4f64(<4 x double> %unused, <4 x double> %a) #0 {
+; CHECK-LABEL: test_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  ret <4 x double> %a
+}
+
+define <16 x bfloat> @test_v16bf16(<16 x bfloat> %unused, <16 x bfloat> %a) #0 {
+; CHECK-LABEL: test_v16bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  ret <16 x bfloat> %a
+}
+
+define <3 x i64> @test_v3i64(<3 x i64> %unused, <3 x i64> %a) #0 {
+; CHECK-LABEL: test_v3i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d2, d5
+; CHECK-NEXT:    fmov d1, d4
+; CHECK-NEXT:    fmov d0, d3
+; CHECK-NEXT:    ret
+  ret <3 x i64> %a
+}
+
+define <5 x i64> @test_v5i64(<5 x i64> %unused, <5 x i64> %a) #0 {
+; CHECK-LABEL: test_v5i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d1, d6
+; CHECK-NEXT:    fmov d0, d5
+; CHECK-NEXT:    fmov d2, d7
+; CHECK-NEXT:    ldp d3, d4, [sp]
+; CHECK-NEXT:    ret
+  ret <5 x i64> %a
+}
+
+define <1 x i16> @test_v1i16(<1 x i16> %unused, <1 x i16> %a) #0 {
+; CHECK-LABEL: test_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+  ret <1 x i16> %a
+}
+
+define <9 x i16> @test_v9i16(<9 x i16> %unused, <9 x i16> %a) #0 {
+; CHECK-LABEL: test_v9i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [sp, #8]
+; CHECK-NEXT:    add x9, sp, #16
+; CHECK-NEXT:    ld1 { v0.h }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #24
+; CHECK-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-NEXT:    add x9, sp, #32
+; CHECK-NEXT:    ld1 { v0.h }[3], [x9]
+; CHECK-NEXT:    add x9, sp, #40
+; CHECK-NEXT:    ld1 { v0.h }[4], [x9]
+; CHECK-NEXT:    add x9, sp, #48
+; CHECK-NEXT:    ld1 { v0.h }[5], [x9]
+; CHECK-NEXT:    add x9, sp, #56
+; CHECK-NEXT:    ld1 { v0.h }[6], [x9]
+; CHECK-NEXT:    add x9, sp, #64
+; CHECK-NEXT:    ld1 { v0.h }[7], [x9]
+; CHECK-NEXT:    ldrh w9, [sp, #72]
+; CHECK-NEXT:    strh w9, [x8, #16]
+; CHECK-NEXT:    str q0, [x8]
+; CHECK-NEXT:    ret
+  ret <9 x i16> %a
+}
+
+define <16 x i1> @test_v16i1(<16 x i1> %unused, <16 x i1> %a) #0 {
+; CHECK-LABEL: test_v16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  ret <16 x i1> %a
+}
+
+; UTC_ARGS: --disable
+; The output from this test is large and generally not useful; what matters is
+; that no vector registers are used.
+define <32 x i1> @test_v32i1(<32 x i1> %unused, <32 x i1> %a) #0 {
+; CHECK-LABEL: test_v32i1:
+; CHECK:       // %bb.0:
+; CHECK-NOT:     {{[qvz][0-9]+}}
+; CHECK:         ret
+  ret <32 x i1> %a
+}
+; UTC_ARGS: --enable
+
+define <1 x i128> @test_v1i128(<1 x i128> %unused, <1 x i128> %a) #0 {
+; CHECK-LABEL: test_v1i128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x1, x3
+; CHECK-NEXT:    mov x0, x2
+; CHECK-NEXT:    ret
+  ret <1 x i128> %a
+}
+
+define <2 x i128> @test_v2i128(<2 x i128> %unused, <2 x i128> %a) #0 {
+; CHECK-LABEL: test_v2i128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x3, x7
+; CHECK-NEXT:    mov x2, x6
+; CHECK-NEXT:    mov x0, x4
+; CHECK-NEXT:    mov x1, x5
+; CHECK-NEXT:    ret
+  ret <2 x i128> %a
+}
+
+define <1 x i256> @test_v1i256(<1 x i256> %unused, <1 x i256> %a) #0 {
+; CHECK-LABEL: test_v1i256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x3, x7
+; CHECK-NEXT:    mov x2, x6
+; CHECK-NEXT:    mov x0, x4
+; CHECK-NEXT:    mov x1, x5
+; CHECK-NEXT:    ret
+  ret <1 x i256> %a
+}
+
+define <2 x i256> @test_v2i256(<2 x i256> %unused, <2 x i256> %a) #0 {
+; CHECK-LABEL: test_v2i256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp x0, x1, [sp]
+; CHECK-NEXT:    ldp x2, x3, [sp, #16]
+; CHECK-NEXT:    ldp x4, x5, [sp, #32]
+; CHECK-NEXT:    ldp x6, x7, [sp, #48]
+; CHECK-NEXT:    ret
+  ret <2 x i256> %a
+}
+
+define <1 x fp128> @test_v1f128(<1 x fp128> %unused, <1 x fp128> %a) #0 {
+; CHECK-LABEL: test_v1f128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  ret <1 x fp128> %a
+}
+
+define <2 x fp128> @test_v2f128(<2 x fp128> %unused, <2 x fp128> %a) #0 {
+; CHECK-LABEL: test_v2f128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  ret <2 x fp128> %a
+}
+
+attributes #0 = { "target-features"="+sve,+bf16" }

From ea610b097133036468d59b7ebe05dce8c6a4bce2 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Tue, 31 Oct 2023 17:44:31 +0000
Subject: [PATCH 2/2] [LLVM][SVE] Honour calling convention when using SVE for
 fixed length vectors.

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 96 +++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h | 12 +++
 .../sve-fixed-length-function-calls.ll        |  2 +
 .../AArch64/sve-fixed-length-functions.ll     |  2 +
 4 files changed, 112 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 28e038abcecfc8e..69109f800254efb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26718,3 +26718,99 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
 unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
   return Subtarget->getMinimumJumpTableEntries();
 }
+
+MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+                                                         CallingConv::ID CC,
+                                                         EVT VT) const {
+  bool NonUnitFixedLengthVector =
+      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
+  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
+    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+
+  EVT VT1;
+  MVT RegisterVT;
+  unsigned NumIntermediates;
+  getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
+                                       RegisterVT);
+  return RegisterVT;
+}
+
+unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
+    LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
+  bool NonUnitFixedLengthVector =
+      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
+  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
+    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+
+  EVT VT1;
+  MVT VT2;
+  unsigned NumIntermediates;
+  return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
+                                              NumIntermediates, VT2);
+}
+
+unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
+    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+    unsigned &NumIntermediates, MVT &RegisterVT) const {
+  int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
+      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+  if (!RegisterVT.isFixedLengthVector() ||
+      RegisterVT.getFixedSizeInBits() <= 128)
+    return NumRegs;
+
+  assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
+  assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
+  assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
+
+  // A size mismatch here implies either type promotion or widening and would
+  // have resulted in scalarisation if larger vectors had not been available.
+  if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
+    EVT EltTy = VT.getVectorElementType();
+    EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
+    if (!isTypeLegal(NewVT))
+      NewVT = EltTy;
+
+    IntermediateVT = NewVT;
+    NumIntermediates = VT.getVectorNumElements();
+    RegisterVT = getRegisterType(Context, NewVT);
+    return NumIntermediates;
+  }
+
+  // SVE VLS support does not introduce a new ABI, so we should use NEON-sized
+  // types for vector arguments and returns.
+
+  unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
+  NumIntermediates *= NumSubRegs;
+  NumRegs *= NumSubRegs;
+
+  switch (RegisterVT.getVectorElementType().SimpleTy) {
+  default:
+    llvm_unreachable("unexpected element type for vector");
+  case MVT::i8:
+    IntermediateVT = RegisterVT = MVT::v16i8;
+    break;
+  case MVT::i16:
+    IntermediateVT = RegisterVT = MVT::v8i16;
+    break;
+  case MVT::i32:
+    IntermediateVT = RegisterVT = MVT::v4i32;
+    break;
+  case MVT::i64:
+    IntermediateVT = RegisterVT = MVT::v2i64;
+    break;
+  case MVT::f16:
+    IntermediateVT = RegisterVT = MVT::v8f16;
+    break;
+  case MVT::f32:
+    IntermediateVT = RegisterVT = MVT::v4f32;
+    break;
+  case MVT::f64:
+    IntermediateVT = RegisterVT = MVT::v2f64;
+    break;
+  case MVT::bf16:
+    IntermediateVT = RegisterVT = MVT::v8bf16;
+    break;
+  }
+
+  return NumRegs;
+}
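
For illustration, here is a minimal standalone example of the two paths above (not part of the patch; the function names and CHECK lines are illustrative only). With the override in place, an <8 x i32> argument is still split into two v4i32 pieces passed in q0/q1, and a type such as <9 x i16> that would be promoted or widened still falls back to per-element passing in general-purpose registers, so neither case touches SVE z registers:

    ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
    target triple = "aarch64-unknown-linux-gnu"

    declare void @callee_v8i32(<8 x i32>)
    declare void @callee_v9i16(<9 x i16>)

    ; The <8 x i32> argument arrives as two v4i32 halves in q0/q1, so the
    ; matching tail call needs no register moves and no z registers.
    define void @caller_v8i32(<8 x i32> %a) #0 {
    ; CHECK-LABEL: caller_v8i32:
    ; CHECK-NOT:     {{[qvz][0-9]+}}
    ; CHECK:         b callee_v8i32
      tail call void @callee_v8i32(<8 x i32> %a)
      ret void
    }

    ; The <9 x i16> argument is passed element-by-element in w0-w7 plus the
    ; stack, exactly as it is without SVE.
    define void @caller_v9i16(<9 x i16> %a) #0 {
    ; CHECK-LABEL: caller_v9i16:
    ; CHECK-NOT:     {{[qvz][0-9]+}}
    ; CHECK:         b callee_v9i16
      tail call void @callee_v9i16(<9 x i16> %a)
      ret void
    }

    attributes #0 = { "target-features"="+sve" nounwind }
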
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 169b0dbab65cdca..c67c7c5affdc48e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -954,6 +954,18 @@ class AArch64TargetLowering : public TargetLowering {
   // used for 64bit and 128bit vectors as well.
   bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const;
 
+  // Follow NEON ABI rules even when using SVE for fixed length vectors.
+  MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
+                                    EVT VT) const override;
+  unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+                                         CallingConv::ID CC,
+                                         EVT VT) const override;
+  unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
+                                                CallingConv::ID CC, EVT VT,
+                                                EVT &IntermediateVT,
+                                                unsigned &NumIntermediates,
+                                                MVT &RegisterVT) const override;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
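
The new hooks affect return values in the same way, which is what the sve-fixed-length-functions.ll changes below exercise. As an illustrative sketch (mirroring test_v4f64 from the new test file, not additional patch content), a <4 x double> return is still produced in q0/q1 rather than a single 256-bit SVE register:

    ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
    target triple = "aarch64-unknown-linux-gnu"

    ; The returned value is moved into the two NEON registers q0 and q1
    ; (via their v0/v1 aliases); no z registers are involved.
    define <4 x double> @ret_v4f64(<4 x double> %unused, <4 x double> %a) #0 {
    ; CHECK-LABEL: ret_v4f64:
    ; CHECK:         mov v1.16b, v3.16b
    ; CHECK:         mov v0.16b, v2.16b
    ; CHECK:         ret
      ret <4 x double> %a
    }

    attributes #0 = { "target-features"="+sve" nounwind }
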
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
index c34fcc59b3187b2..cea3915f3ea8dca 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
index 98c17d78189c76a..96f550826bb4ccc 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 


