[llvm] 4db451a - [LLVM][SVE] Honour calling convention when using SVE for fixed length vectors. (#70847)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 30 04:10:04 PST 2023
Author: Paul Walker
Date: 2023-11-30T12:09:58Z
New Revision: 4db451a87d56ab469af2eecbae47338e540b1276
URL: https://github.com/llvm/llvm-project/commit/4db451a87d56ab469af2eecbae47338e540b1276
DIFF: https://github.com/llvm/llvm-project/commit/4db451a87d56ab469af2eecbae47338e540b1276.diff
LOG: [LLVM][SVE] Honour calling convention when using SVE for fixed length vectors. (#70847)
NOTE: I'm not sure how many of the corner cases are part of the
documented ABI but that shouldn't matter because my goal is for
`-msve-vector-bits` to have no effect on the way arguments and returns
are processed.
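For readers skimming the patch, here is a minimal standalone sketch (not part of
the commit; the helper name and structure are purely illustrative) of the
integer-element breakdown the new getVectorTypeBreakdownForCallingConv override
performs: a legal fixed-length vector wider than 128 bits is re-expressed as
TotalBits/128 copies of the matching 128-bit NEON type, so argument and return
passing does not depend on the configured SVE vector length.

// Illustrative sketch only; not LLVM code. Assumes EltBits divides 128 and
// TotalBits is a multiple of 128.
#include <cstdio>
#include <string>

struct Breakdown {
  std::string RegisterType; // 128-bit NEON type carrying each part
  unsigned NumRegisters;    // how many such registers are needed
};

Breakdown breakDownFixedVector(unsigned EltBits, unsigned TotalBits) {
  unsigned LanesPer128 = 128 / EltBits; // lanes per 128-bit NEON register
  return {"v" + std::to_string(LanesPer128) + "i" + std::to_string(EltBits),
          TotalBits / 128};
}

int main() {
  // <8 x i32> is 256 bits: 2 x v4i32, independent of -msve-vector-bits.
  Breakdown B = breakDownFixedVector(/*EltBits=*/32, /*TotalBits=*/256);
  std::printf("<8 x i32> -> %u x %s\n", B.NumRegisters, B.RegisterType.c_str());
  return 0;
}

This matches what the new tests below expect: for example test_v8i32 receives
<8 x i32> in q2/q3 and simply moves it to q0/q1, whether or not an SVE vector
length is specified on the llc command line.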
Added:
llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 166b0d84a2ce28a..149a6d413d81442 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26719,3 +26719,99 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
return Subtarget->getMinimumJumpTableEntries();
}
+
+MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ bool NonUnitFixedLengthVector =
+ VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
+ if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+
+ EVT VT1;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
+ RegisterVT);
+ return RegisterVT;
+}
+
+unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
+ bool NonUnitFixedLengthVector =
+ VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
+ if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
+ return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+
+ EVT VT1;
+ MVT VT2;
+ unsigned NumIntermediates;
+ return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
+ NumIntermediates, VT2);
+}
+
+unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const {
+ int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
+ Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+ if (!RegisterVT.isFixedLengthVector() ||
+ RegisterVT.getFixedSizeInBits() <= 128)
+ return NumRegs;
+
+ assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
+ assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
+ assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
+
+ // A size mismatch here implies either type promotion or widening and would
+ // have resulted in scalarisation if larger vectors had not been available.
+ if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
+ EVT EltTy = VT.getVectorElementType();
+ EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
+ if (!isTypeLegal(NewVT))
+ NewVT = EltTy;
+
+ IntermediateVT = NewVT;
+ NumIntermediates = VT.getVectorNumElements();
+ RegisterVT = getRegisterType(Context, NewVT);
+ return NumIntermediates;
+ }
+
+ // SVE VLS support does not introduce a new ABI so we should use NEON sized
+ // types for vector arguments and returns.
+
+ unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
+ NumIntermediates *= NumSubRegs;
+ NumRegs *= NumSubRegs;
+
+ switch (RegisterVT.getVectorElementType().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for vector");
+ case MVT::i8:
+ IntermediateVT = RegisterVT = MVT::v16i8;
+ break;
+ case MVT::i16:
+ IntermediateVT = RegisterVT = MVT::v8i16;
+ break;
+ case MVT::i32:
+ IntermediateVT = RegisterVT = MVT::v4i32;
+ break;
+ case MVT::i64:
+ IntermediateVT = RegisterVT = MVT::v2i64;
+ break;
+ case MVT::f16:
+ IntermediateVT = RegisterVT = MVT::v8f16;
+ break;
+ case MVT::f32:
+ IntermediateVT = RegisterVT = MVT::v4f32;
+ break;
+ case MVT::f64:
+ IntermediateVT = RegisterVT = MVT::v2f64;
+ break;
+ case MVT::bf16:
+ IntermediateVT = RegisterVT = MVT::v8bf16;
+ break;
+ }
+
+ return NumRegs;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 169b0dbab65cdca..c67c7c5affdc48e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -954,6 +954,18 @@ class AArch64TargetLowering : public TargetLowering {
// used for 64bit and 128bit vectors as well.
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const;
+ // Follow NEON ABI rules even when using SVE for fixed length vectors.
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
+ EVT VT) const override;
+ unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const override;
+ unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC, EVT VT,
+ EVT &IntermediateVT,
+ unsigned &NumIntermediates,
+ MVT &RegisterVT) const override;
+
private:
/// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
new file mode 100644
index 000000000000000..cea3915f3ea8dca
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
@@ -0,0 +1,245 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+declare void @foo_v32i8(<32 x i8>)
+define void @test_v32i8(<32 x i8> %unused, <32 x i8> %a) #0 {
+; CHECK-LABEL: test_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_v32i8
+ tail call void @foo_v32i8(<32 x i8> %a)
+ ret void
+}
+
+declare void @foo_v16i16(<16 x i16>)
+define void @test_v16i16(<16 x i16> %unused, <16 x i16> %a) #0 {
+; CHECK-LABEL: test_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_v16i16
+ tail call void @foo_v16i16(<16 x i16> %a)
+ ret void
+}
+
+declare void @foo_v8i32(<8 x i32>)
+define void @test_v8i32(<8 x i32> %unused, <8 x i32> %a) #0 {
+; CHECK-LABEL: test_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_v8i32
+ tail call void @foo_v8i32(<8 x i32> %a)
+ ret void
+}
+
+declare void @foo_v4i64(<4 x i64>)
+define void @test_v4i64(<4 x i64> %unused, <4 x i64> %a) #0 {
+; CHECK-LABEL: test_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_v4i64
+ tail call void @foo_v4i64(<4 x i64> %a)
+ ret void
+}
+
+declare void @foo_v16f16(<16 x half>)
+define void @test_v16f16(<16 x half> %unused, <16 x half> %a) #0 {
+; CHECK-LABEL: test_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_v16f16
+ tail call void @foo_v16f16(<16 x half> %a)
+ ret void
+}
+
+declare void @foo_v8f32(<8 x float>)
+define void @test_v8f32(<8 x float> %unused, <8 x float> %a) #0 {
+; CHECK-LABEL: test_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_v8f32
+ tail call void @foo_v8f32(<8 x float> %a)
+ ret void
+}
+
+declare void @foo_v4f64(<4 x double>)
+define void @test_v4f64(<4 x double> %unused, <4 x double> %a) #0 {
+; CHECK-LABEL: test_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_v4f64
+ tail call void @foo_v4f64(<4 x double> %a)
+ ret void
+}
+
+declare void @foo_v16bf16(<16 x bfloat>)
+define void @test_v16bf16(<16 x bfloat> %unused, <16 x bfloat> %a) #0 {
+; CHECK-LABEL: test_v16bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_v16bf16
+ tail call void @foo_v16bf16(<16 x bfloat> %a)
+ ret void
+}
+
+declare void @foo_v3i64(<3 x i64>)
+define void @test_v3i64(<3 x i64> %unused, <3 x i64> %a) #0 {
+; CHECK-LABEL: test_v3i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d2, d5
+; CHECK-NEXT: fmov d1, d4
+; CHECK-NEXT: fmov d0, d3
+; CHECK-NEXT: b foo_v3i64
+ tail call void @foo_v3i64(<3 x i64> %a)
+ ret void
+}
+
+declare void @foo_v5i64(<5 x i64>)
+define void @test_v5i64(<5 x i64> %unused, <5 x i64> %a) #0 {
+; CHECK-LABEL: test_v5i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d1, d6
+; CHECK-NEXT: fmov d0, d5
+; CHECK-NEXT: fmov d2, d7
+; CHECK-NEXT: ldp d3, d4, [sp]
+; CHECK-NEXT: b foo_v5i64
+ tail call void @foo_v5i64(<5 x i64> %a)
+ ret void
+}
+
+declare void @foo_v1i16(<1 x i16>)
+define void @test_v1i16(<1 x i16> %unused, <1 x i16> %a) #0 {
+; CHECK-LABEL: test_v1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: b foo_v1i16
+ tail call void @foo_v1i16(<1 x i16> %a)
+ ret void
+}
+
+declare void @foo_v9i16(<9 x i16>)
+define void @test_v9i16(<9 x i16> %unused, <9 x i16> %a) #0 {
+; CHECK-LABEL: test_v9i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w0, [sp, #8]
+; CHECK-NEXT: ldr w1, [sp, #16]
+; CHECK-NEXT: ldr w2, [sp, #24]
+; CHECK-NEXT: ldr w3, [sp, #32]
+; CHECK-NEXT: ldr w4, [sp, #40]
+; CHECK-NEXT: ldr w5, [sp, #48]
+; CHECK-NEXT: ldr w6, [sp, #56]
+; CHECK-NEXT: ldr w7, [sp, #64]
+; CHECK-NEXT: ldr w8, [sp, #72]
+; CHECK-NEXT: str w8, [sp]
+; CHECK-NEXT: b foo_v9i16
+ tail call void @foo_v9i16(<9 x i16> %a)
+ ret void
+}
+
+declare void @foo_v16i1(<16 x i1>)
+define void @test_v16i1(<16 x i1> %unused, <16 x i1> %a) #0 {
+; CHECK-LABEL: test_v16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: b foo_v16i1
+ tail call void @foo_v16i1(<16 x i1> %a)
+ ret void
+}
+
+; UTC_ARGS: --disable
+; The output from this test is large and generally not useful; what matters is
+; that no vector registers are used.
+declare void @foo_v32i1(<32 x i1>)
+define void @test_v32i1(<32 x i1> %unused, <32 x i1> %a) #0 {
+; CHECK-LABEL: test_v32i1:
+; CHECK: // %bb.0:
+; CHECK-NOT: [q,v,z][0-9]+
+; CHECK: b foo_v32i1
+ tail call void @foo_v32i1(<32 x i1> %a)
+ ret void
+}
+; UTC_ARGS: --enable
+
+declare void @foo_v1i128(<1 x i128>)
+define void @test_v1i128(<1 x i128> %unused, <1 x i128> %a) #0 {
+; CHECK-LABEL: test_v1i128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x1, x3
+; CHECK-NEXT: mov x0, x2
+; CHECK-NEXT: b foo_v1i128
+ tail call void @foo_v1i128(<1 x i128> %a)
+ ret void
+}
+
+declare void @foo_v2i128(<2 x i128>)
+define void @test_v2i128(<2 x i128> %unused, <2 x i128> %a) #0 {
+; CHECK-LABEL: test_v2i128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x3, x7
+; CHECK-NEXT: mov x2, x6
+; CHECK-NEXT: mov x0, x4
+; CHECK-NEXT: mov x1, x5
+; CHECK-NEXT: b foo_v2i128
+ tail call void @foo_v2i128(<2 x i128> %a)
+ ret void
+}
+
+declare void @foo_v1i256(<1 x i256>)
+define void @test_v1i256(<1 x i256> %unused, <1 x i256> %a) #0 {
+; CHECK-LABEL: test_v1i256:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x3, x7
+; CHECK-NEXT: mov x2, x6
+; CHECK-NEXT: mov x0, x4
+; CHECK-NEXT: mov x1, x5
+; CHECK-NEXT: b foo_v1i256
+ tail call void @foo_v1i256(<1 x i256> %a)
+ ret void
+}
+
+declare void @foo_v2i256(<2 x i256>)
+define void @test_v2i256(<2 x i256> %unused, <2 x i256> %a) #0 {
+; CHECK-LABEL: test_v2i256:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp x0, x1, [sp]
+; CHECK-NEXT: ldp x2, x3, [sp, #16]
+; CHECK-NEXT: ldp x4, x5, [sp, #32]
+; CHECK-NEXT: ldp x6, x7, [sp, #48]
+; CHECK-NEXT: b foo_v2i256
+ tail call void @foo_v2i256(<2 x i256> %a)
+ ret void
+}
+
+declare void @foo_v1f128(<1 x fp128>)
+define void @test_v1f128(<1 x fp128> %unused, <1 x fp128> %a) #0 {
+; CHECK-LABEL: test_v1f128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: b foo_v1f128
+ tail call void @foo_v1f128(<1 x fp128> %a)
+ ret void
+}
+
+declare void @foo_v2f128(<2 x fp128>)
+define void @test_v2f128(<2 x fp128> %unused, <2 x fp128> %a) #0 {
+; CHECK-LABEL: test_v2f128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_v2f128
+ tail call void @foo_v2f128(<2 x fp128> %a)
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve,+bf16" nounwind }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
new file mode 100644
index 000000000000000..96f550826bb4ccc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <32 x i8> @test_v32i8(<32 x i8> %unused, <32 x i8> %a) #0 {
+; CHECK-LABEL: test_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <32 x i8> %a
+}
+
+define <16 x i16> @test_v16i16(<16 x i16> %unused, <16 x i16> %a) #0 {
+; CHECK-LABEL: test_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <16 x i16> %a
+}
+
+define <8 x i32> @test_v8i32(<8 x i32> %unused, <8 x i32> %a) #0 {
+; CHECK-LABEL: test_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <8 x i32> %a
+}
+
+define <4 x i64> @test_v4i64(<4 x i64> %unused, <4 x i64> %a) #0 {
+; CHECK-LABEL: test_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <4 x i64> %a
+}
+
+define <16 x half> @test_v16f16(<16 x half> %unused, <16 x half> %a) #0 {
+; CHECK-LABEL: test_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <16 x half> %a
+}
+
+define <8 x float> @test_v8f32(<8 x float> %unused, <8 x float> %a) #0 {
+; CHECK-LABEL: test_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <8 x float> %a
+}
+
+define <4 x double> @test_v4f64(<4 x double> %unused, <4 x double> %a) #0 {
+; CHECK-LABEL: test_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <4 x double> %a
+}
+
+define <16 x bfloat> @test_v16bf16(<16 x bfloat> %unused, <16 x bfloat> %a) #0 {
+; CHECK-LABEL: test_v16bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <16 x bfloat> %a
+}
+
+define <3 x i64> @test_v3i64(<3 x i64> %unused, <3 x i64> %a) #0 {
+; CHECK-LABEL: test_v3i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d2, d5
+; CHECK-NEXT: fmov d1, d4
+; CHECK-NEXT: fmov d0, d3
+; CHECK-NEXT: ret
+ ret <3 x i64> %a
+}
+
+define <5 x i64> @test_v5i64(<5 x i64> %unused, <5 x i64> %a) #0 {
+; CHECK-LABEL: test_v5i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d1, d6
+; CHECK-NEXT: fmov d0, d5
+; CHECK-NEXT: fmov d2, d7
+; CHECK-NEXT: ldp d3, d4, [sp]
+; CHECK-NEXT: ret
+ ret <5 x i64> %a
+}
+
+define <1 x i16> @test_v1i16(<1 x i16> %unused, <1 x i16> %a) #0 {
+; CHECK-LABEL: test_v1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+ ret <1 x i16> %a
+}
+
+define <9 x i16> @test_v9i16(<9 x i16> %unused, <9 x i16> %a) #0 {
+; CHECK-LABEL: test_v9i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [sp, #8]
+; CHECK-NEXT: add x9, sp, #16
+; CHECK-NEXT: ld1 { v0.h }[1], [x9]
+; CHECK-NEXT: add x9, sp, #24
+; CHECK-NEXT: ld1 { v0.h }[2], [x9]
+; CHECK-NEXT: add x9, sp, #32
+; CHECK-NEXT: ld1 { v0.h }[3], [x9]
+; CHECK-NEXT: add x9, sp, #40
+; CHECK-NEXT: ld1 { v0.h }[4], [x9]
+; CHECK-NEXT: add x9, sp, #48
+; CHECK-NEXT: ld1 { v0.h }[5], [x9]
+; CHECK-NEXT: add x9, sp, #56
+; CHECK-NEXT: ld1 { v0.h }[6], [x9]
+; CHECK-NEXT: add x9, sp, #64
+; CHECK-NEXT: ld1 { v0.h }[7], [x9]
+; CHECK-NEXT: ldrh w9, [sp, #72]
+; CHECK-NEXT: strh w9, [x8, #16]
+; CHECK-NEXT: str q0, [x8]
+; CHECK-NEXT: ret
+ ret <9 x i16> %a
+}
+
+define <16 x i1> @test_v16i1(<16 x i1> %unused, <16 x i1> %a) #0 {
+; CHECK-LABEL: test_v16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+ ret <16 x i1> %a
+}
+
+; UTC_ARGS: --disable
+; The output from this test is large and generally not useful; what matters is
+; that no vector registers are used.
+define <32 x i1> @test_v32i1(<32 x i1> %unused, <32 x i1> %a) #0 {
+; CHECK-LABEL: test_v32i1:
+; CHECK: // %bb.0:
+; CHECK-NOT: [q,v,z][0-9]+
+; CHECK: ret
+ ret <32 x i1> %a
+}
+; UTC_ARGS: --enable
+
+define <1 x i128> @test_v1i128(<1 x i128> %unused, <1 x i128> %a) #0 {
+; CHECK-LABEL: test_v1i128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x1, x3
+; CHECK-NEXT: mov x0, x2
+; CHECK-NEXT: ret
+ ret <1 x i128> %a
+}
+
+define <2 x i128> @test_v2i128(<2 x i128> %unused, <2 x i128> %a) #0 {
+; CHECK-LABEL: test_v2i128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x3, x7
+; CHECK-NEXT: mov x2, x6
+; CHECK-NEXT: mov x0, x4
+; CHECK-NEXT: mov x1, x5
+; CHECK-NEXT: ret
+ ret <2 x i128> %a
+}
+
+define <1 x i256> @test_v1i256(<1 x i256> %unused, <1 x i256> %a) #0 {
+; CHECK-LABEL: test_v1i256:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x3, x7
+; CHECK-NEXT: mov x2, x6
+; CHECK-NEXT: mov x0, x4
+; CHECK-NEXT: mov x1, x5
+; CHECK-NEXT: ret
+ ret <1 x i256> %a
+}
+
+define <2 x i256> @test_v2i256(<2 x i256> %unused, <2 x i256> %a) #0 {
+; CHECK-LABEL: test_v2i256:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp x0, x1, [sp]
+; CHECK-NEXT: ldp x2, x3, [sp, #16]
+; CHECK-NEXT: ldp x4, x5, [sp, #32]
+; CHECK-NEXT: ldp x6, x7, [sp, #48]
+; CHECK-NEXT: ret
+ ret <2 x i256> %a
+}
+
+define <1 x fp128> @test_v1f128(<1 x fp128> %unused, <1 x fp128> %a) #0 {
+; CHECK-LABEL: test_v1f128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+ ret <1 x fp128> %a
+}
+
+define <2 x fp128> @test_v2f128(<2 x fp128> %unused, <2 x fp128> %a) #0 {
+; CHECK-LABEL: test_v2f128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <2 x fp128> %a
+}
+
+attributes #0 = { "target-features"="+sve,+bf16" }