[llvm] [WIP] - [LLVM][SVE] Honour NEON calling convention when targeting SVE VLS. (PR #70847)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 23 10:23:04 PST 2023
https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/70847
>From 2f4066650ede384df622236daf4428bdbe021b3d Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Thu, 23 Nov 2023 14:44:58 +0000
Subject: [PATCH 1/2] [LLVM][SVE] Add tests to prove SVE does not affect the
calling convention.
NOTE: When SVE is enabled some of these tests trigger asserts.
---
.../sve-fixed-length-function-calls.ll | 162 ++++++++++++++++++
.../AArch64/sve-fixed-length-functions.ll | 144 ++++++++++++++++
2 files changed, 306 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
new file mode 100644
index 000000000000000..fb7bdddcd84b0c0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
@@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+declare void @foo_nxv32i8(<32 x i8>)
+define void @test_nxv32i8(<32 x i8> %unused, <32 x i8> %a) #0 {
+; CHECK-LABEL: test_nxv32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_nxv32i8
+ tail call void @foo_nxv32i8(<32 x i8> %a)
+ ret void
+}
+
+declare void @foo_nxv16i16(<16 x i16>)
+define void @test_nxv16i16(<16 x i16> %unused, <16 x i16> %a) #0 {
+; CHECK-LABEL: test_nxv16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_nxv16i16
+ tail call void @foo_nxv16i16(<16 x i16> %a)
+ ret void
+}
+
+declare void @foo_nxv8i32(<8 x i32>)
+define void @test_nxv8i32(<8 x i32> %unused, <8 x i32> %a) #0 {
+; CHECK-LABEL: test_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_nxv8i32
+ tail call void @foo_nxv8i32(<8 x i32> %a)
+ ret void
+}
+
+declare void @foo_nxv4i64(<4 x i64>)
+define void @test_nxv4i64(<4 x i64> %unused, <4 x i64> %a) #0 {
+; CHECK-LABEL: test_nxv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_nxv4i64
+ tail call void @foo_nxv4i64(<4 x i64> %a)
+ ret void
+}
+
+declare void @foo_nxv16f16(<16 x half>)
+define void @test_nxv16f16(<16 x half> %unused, <16 x half> %a) #0 {
+; CHECK-LABEL: test_nxv16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_nxv16f16
+ tail call void @foo_nxv16f16(<16 x half> %a)
+ ret void
+}
+
+declare void @foo_nxv8f32(<8 x float>)
+define void @test_nxv8f32(<8 x float> %unused, <8 x float> %a) #0 {
+; CHECK-LABEL: test_nxv8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_nxv8f32
+ tail call void @foo_nxv8f32(<8 x float> %a)
+ ret void
+}
+
+declare void @foo_nxv4f64(<4 x double>)
+define void @test_nxv4f64(<4 x double> %unused, <4 x double> %a) #0 {
+; CHECK-LABEL: test_nxv4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_nxv4f64
+ tail call void @foo_nxv4f64(<4 x double> %a)
+ ret void
+}
+
+declare void @foo_nxv16bf16(<16 x bfloat>)
+define void @test_nxv16bf16(<16 x bfloat> %unused, <16 x bfloat> %a) #0 {
+; CHECK-LABEL: test_nxv16bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: b foo_nxv16bf16
+ tail call void @foo_nxv16bf16(<16 x bfloat> %a)
+ ret void
+}
+
+declare void @foo_nxv3i64(<3 x i64>)
+define void @test_nxv3i64(<3 x i64> %unused, <3 x i64> %a) #0 {
+; CHECK-LABEL: test_nxv3i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d2, d5
+; CHECK-NEXT: fmov d1, d4
+; CHECK-NEXT: fmov d0, d3
+; CHECK-NEXT: b foo_nxv3i64
+ tail call void @foo_nxv3i64(<3 x i64> %a)
+ ret void
+}
+
+declare void @foo_nxv5i64(<5 x i64>)
+define void @test_nxv5i64(<5 x i64> %unused, <5 x i64> %a) #0 {
+; CHECK-LABEL: test_nxv5i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d1, d6
+; CHECK-NEXT: fmov d0, d5
+; CHECK-NEXT: fmov d2, d7
+; CHECK-NEXT: ldp d3, d4, [sp]
+; CHECK-NEXT: b foo_nxv5i64
+ tail call void @foo_nxv5i64(<5 x i64> %a)
+ ret void
+}
+
+declare void @foo_nxv9i16(<9 x i16>)
+define void @test_nxv9i16(<9 x i16> %unused, <9 x i16> %a) #0 {
+; CHECK-LABEL: test_nxv9i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w0, [sp, #8]
+; CHECK-NEXT: ldr w1, [sp, #16]
+; CHECK-NEXT: ldr w2, [sp, #24]
+; CHECK-NEXT: ldr w3, [sp, #32]
+; CHECK-NEXT: ldr w4, [sp, #40]
+; CHECK-NEXT: ldr w5, [sp, #48]
+; CHECK-NEXT: ldr w6, [sp, #56]
+; CHECK-NEXT: ldr w7, [sp, #64]
+; CHECK-NEXT: ldr w8, [sp, #72]
+; CHECK-NEXT: str w8, [sp]
+; CHECK-NEXT: b foo_nxv9i16
+ tail call void @foo_nxv9i16(<9 x i16> %a)
+ ret void
+}
+
+declare void @foo_nxv16i1(<16 x i1>)
+define void @test_nxv16i1(<16 x i1> %unused, <16 x i1> %a) #0 {
+; CHECK-LABEL: test_nxv16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: b foo_nxv16i1
+ tail call void @foo_nxv16i1(<16 x i1> %a)
+ ret void
+}
+
+; UTC_ARGS: --disable
+; The output from this test is large and generally not useful; what matters is
+; that no vector registers are used.
+declare void @foo_nxv32i1(<32 x i1>)
+define void @test_nxv32i1(<32 x i1> %unused, <32 x i1> %a) #0 {
+; CHECK-LABEL: test_nxv32i1:
+; CHECK: // %bb.0:
+; CHECK-NOT: {{[qvz][0-9]+}}
+; CHECK: b foo_nxv32i1
+ tail call void @foo_nxv32i1(<32 x i1> %a)
+ ret void
+}
+; UTC_ARGS: --enable
+
+attributes #0 = { "target-features"="+sve,+bf16" nounwind }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
new file mode 100644
index 000000000000000..307f4a9ecb4b610
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <32 x i8> @test_nxv32i8(<32 x i8> %unused, <32 x i8> %a) #0 {
+; CHECK-LABEL: test_nxv32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <32 x i8> %a
+}
+
+define <16 x i16> @test_nxv16i16(<16 x i16> %unused, <16 x i16> %a) #0 {
+; CHECK-LABEL: test_nxv16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <16 x i16> %a
+}
+
+define <8 x i32> @test_nxv8i32(<8 x i32> %unused, <8 x i32> %a) #0 {
+; CHECK-LABEL: test_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <8 x i32> %a
+}
+
+define <4 x i64> @test_nxv4i64(<4 x i64> %unused, <4 x i64> %a) #0 {
+; CHECK-LABEL: test_nxv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <4 x i64> %a
+}
+
+define <16 x half> @test_nxv16f16(<16 x half> %unused, <16 x half> %a) #0 {
+; CHECK-LABEL: test_nxv16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <16 x half> %a
+}
+
+define <8 x float> @test_nxv8f32(<8 x float> %unused, <8 x float> %a) #0 {
+; CHECK-LABEL: test_nxv8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <8 x float> %a
+}
+
+define <4 x double> @test_nxv4f64(<4 x double> %unused, <4 x double> %a) #0 {
+; CHECK-LABEL: test_nxv4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <4 x double> %a
+}
+
+define <16 x bfloat> @test_nxv16bf16(<16 x bfloat> %unused, <16 x bfloat> %a) #0 {
+; CHECK-LABEL: test_nxv16bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+ ret <16 x bfloat> %a
+}
+
+define <3 x i64> @test_nxv3i64(<3 x i64> %unused, <3 x i64> %a) #0 {
+; CHECK-LABEL: test_nxv3i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d2, d5
+; CHECK-NEXT: fmov d1, d4
+; CHECK-NEXT: fmov d0, d3
+; CHECK-NEXT: ret
+ ret <3 x i64> %a
+}
+
+define <5 x i64> @test_nxv5i64(<5 x i64> %unused, <5 x i64> %a) #0 {
+; CHECK-LABEL: test_nxv5i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d1, d6
+; CHECK-NEXT: fmov d0, d5
+; CHECK-NEXT: fmov d2, d7
+; CHECK-NEXT: ldp d3, d4, [sp]
+; CHECK-NEXT: ret
+ ret <5 x i64> %a
+}
+
+define <9 x i16> @test_nxv9i16(<9 x i16> %unused, <9 x i16> %a) #0 {
+; CHECK-LABEL: test_nxv9i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [sp, #8]
+; CHECK-NEXT: add x9, sp, #16
+; CHECK-NEXT: ld1 { v0.h }[1], [x9]
+; CHECK-NEXT: add x9, sp, #24
+; CHECK-NEXT: ld1 { v0.h }[2], [x9]
+; CHECK-NEXT: add x9, sp, #32
+; CHECK-NEXT: ld1 { v0.h }[3], [x9]
+; CHECK-NEXT: add x9, sp, #40
+; CHECK-NEXT: ld1 { v0.h }[4], [x9]
+; CHECK-NEXT: add x9, sp, #48
+; CHECK-NEXT: ld1 { v0.h }[5], [x9]
+; CHECK-NEXT: add x9, sp, #56
+; CHECK-NEXT: ld1 { v0.h }[6], [x9]
+; CHECK-NEXT: add x9, sp, #64
+; CHECK-NEXT: ld1 { v0.h }[7], [x9]
+; CHECK-NEXT: ldrh w9, [sp, #72]
+; CHECK-NEXT: strh w9, [x8, #16]
+; CHECK-NEXT: str q0, [x8]
+; CHECK-NEXT: ret
+ ret <9 x i16> %a
+}
+
+define <16 x i1> @test_nxv16i1(<16 x i1> %unused, <16 x i1> %a) #0 {
+; CHECK-LABEL: test_nxv16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+ ret <16 x i1> %a
+}
+
+; UTC_ARGS: --disable
+; The output from this test is large and generally not useful; what matters is
+; that no vector registers are used.
+define <32 x i1> @test_nxv32i1(<32 x i1> %unused, <32 x i1> %a) #0 {
+; CHECK-LABEL: test_nxv32i1:
+; CHECK: // %bb.0:
+; CHECK-NOT: {{[qvz][0-9]+}}
+; CHECK: ret
+ ret <32 x i1> %a
+}
+; UTC_ARGS: --enable
+
+attributes #0 = { "target-features"="+sve,+bf16" }
>From 9bef6a7a98ceecfac21d0f0d9eb3bd2e05acd5c7 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Tue, 31 Oct 2023 17:44:31 +0000
Subject: [PATCH 2/2] [LLVM][SVE] Honour calling convention when using SVE for
fixed length vectors.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 92 +++++++++++++++++++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 12 +++
.../AArch64/sve-fixed-length-concat.ll | 9 +-
.../sve-fixed-length-extract-subvector.ll | 5 +-
.../AArch64/sve-fixed-length-fp-to-int.ll | 8 +-
.../sve-fixed-length-function-calls.ll | 2 +
.../AArch64/sve-fixed-length-functions.ll | 2 +
.../AArch64/sve-fixed-length-int-to-fp.ll | 8 +-
.../sve-streaming-mode-fixed-length-concat.ll | 18 ++--
...ing-mode-fixed-length-extract-subvector.ll | 4 +-
...e-streaming-mode-fixed-length-fp-to-int.ll | 18 ++--
...e-streaming-mode-fixed-length-int-to-fp.ll | 4 +-
12 files changed, 151 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 28e038abcecfc8e..961e6eeeb53b6d8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26718,3 +26718,95 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
return Subtarget->getMinimumJumpTableEntries();
}
+
+MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ if (!VT.isFixedLengthVector() || !Subtarget->useSVEForFixedLengthVectors())
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+
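+  // Derive the register type from the full calling-convention breakdown.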
+ EVT VT1;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
+ RegisterVT);
+ return RegisterVT;
+}
+
+unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
+ if (!VT.isFixedLengthVector() || !Subtarget->useSVEForFixedLengthVectors())
+ return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+
+ EVT VT1;
+ MVT VT2;
+ unsigned NumIntermediates;
+ return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
+ NumIntermediates, VT2);
+}
+
+unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const {
+ int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
+ Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+ if (!RegisterVT.isFixedLengthVector() ||
+ RegisterVT.getFixedSizeInBits() <= 128)
+ return NumRegs;
+
+ assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
+ assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
+ assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
+
+  // A size mismatch here implies either type promotion or widening and would
+  // have resulted in scalarisation if larger vectors had not been available.
+ if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
+ EVT EltTy = VT.getVectorElementType();
+ EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
+ if (!isTypeLegal(NewVT))
+ NewVT = EltTy;
+
+ IntermediateVT = NewVT;
+ NumIntermediates = VT.getVectorNumElements();
+ RegisterVT = getRegisterType(Context, NewVT);
+ return NumIntermediates;
+ }
+
+  // SVE VLS support does not introduce a new ABI, so we should use
+  // NEON-sized types for vector arguments and returns.
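+  // For example, with -aarch64-sve-vector-bits-min=256 an <8 x i32>
+  // argument is passed in two v4i32 parts (q0 and q1) rather than a
+  // single 256-bit Z register.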
+
+ unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
+ NumIntermediates *= NumSubRegs;
+ NumRegs *= NumSubRegs;
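+  // Map the element type onto the equivalent 128-bit NEON vector type.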
+
+ switch (RegisterVT.getVectorElementType().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for vector");
+ case MVT::i8:
+ IntermediateVT = RegisterVT = MVT::v16i8;
+ break;
+ case MVT::i16:
+ IntermediateVT = RegisterVT = MVT::v8i16;
+ break;
+ case MVT::i32:
+ IntermediateVT = RegisterVT = MVT::v4i32;
+ break;
+ case MVT::i64:
+ IntermediateVT = RegisterVT = MVT::v2i64;
+ break;
+ case MVT::f16:
+    IntermediateVT = RegisterVT = MVT::v8f16;
+ break;
+ case MVT::f32:
+ IntermediateVT = RegisterVT = MVT::v4f32;
+ break;
+ case MVT::f64:
+ IntermediateVT = RegisterVT = MVT::v2f64;
+ break;
+ case MVT::bf16:
+ IntermediateVT = RegisterVT = MVT::v8bf16;
+ break;
+ }
+
+ return NumRegs;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 169b0dbab65cdca..c67c7c5affdc48e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -954,6 +954,18 @@ class AArch64TargetLowering : public TargetLowering {
// used for 64bit and 128bit vectors as well.
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const;
+ // Follow NEON ABI rules even when using SVE for fixed length vectors.
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
+ EVT VT) const override;
+ unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const override;
+ unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC, EVT VT,
+ EVT &IntermediateVT,
+ unsigned &NumIntermediates,
+ MVT &RegisterVT) const override;
+
private:
/// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
index e54d22b140bf60c..d2938fe72f97048 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
@@ -304,7 +304,9 @@ define void @concat_v128i16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov v0.s[1], w1
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> <i32 0, i32 1>
ret <2 x i32> %res
@@ -647,7 +649,10 @@ define void @concat_v128f16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: concat_v2f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> <i32 0, i32 1>
ret <2 x float> %res
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
index 066c06d5aa76c59..8e5b9ae7394c677 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -194,7 +194,7 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v0.2s, v0.s[1]
+; CHECK-NEXT: mov w0, v0.s[1]
; CHECK-NEXT: ret
%ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1)
ret <1 x i32> %ret
@@ -449,7 +449,8 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) vscale_range(2,0) #
; CHECK-LABEL: extract_subvector_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v0.2s, v0.s[1]
+; CHECK-NEXT: mov v0.s[0], v0.s[1]
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: ret
%ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1)
ret <1 x float> %ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
index da0cf927d74d24c..515f9f1f74b151d 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
@@ -510,6 +510,7 @@ define void @fcvtzu_v64f32_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzu_v1f32_v1i64:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $d0
; CHECK-NEXT: fcvtl v0.2d, v0.2s
; CHECK-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -615,7 +616,7 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
%res = fptoui <1 x double> %op1 to <1 x i16>
ret <1 x i16> %res
@@ -717,6 +718,7 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = fptoui <1 x double> %op1 to <1 x i32>
ret <1 x i32> %res
@@ -1403,6 +1405,7 @@ define void @fcvtzs_v64f32_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvtzs_v1f32_v1i64:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $d0
; CHECK-NEXT: fcvtl v0.2d, v0.2s
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -1508,7 +1511,7 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
%res = fptosi <1 x double> %op1 to <1 x i16>
ret <1 x i16> %res
@@ -1610,6 +1613,7 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = fptosi <1 x double> %op1 to <1 x i32>
ret <1 x i32> %res
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
index fb7bdddcd84b0c0..fe98101dcd5f7d3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
@@ -1,5 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
index 307f4a9ecb4b610..ab3ac47d070870f 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-functions.ll
@@ -1,5 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index 50040eaa61e6c57..cc38b0dc93b2e32 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -208,7 +208,7 @@ define void @ucvtf_v64i16_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v1i16_v1f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: uunpklo z0.d, z0.s
@@ -513,6 +513,7 @@ define void @ucvtf_v64i32_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
define <1 x double> @ucvtf_v1i32_v1f64(<1 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v1i32_v1f64:
; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: ucvtf v0.2d, v0.2d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -724,6 +725,7 @@ define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ucvtf v0.2d, v0.2d
; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: ret
%res = uitofp <1 x i64> %op1 to <1 x float>
ret <1 x float> %res
@@ -1121,7 +1123,7 @@ define void @scvtf_v64i16_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
define <1 x double> @scvtf_v1i16_v1f64(<1 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v1i16_v1f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sunpklo z0.d, z0.s
@@ -1432,6 +1434,7 @@ define void @scvtf_v64i32_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
define <1 x double> @scvtf_v1i32_v1f64(<1 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v1i32_v1f64:
; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: scvtf v0.2d, v0.2d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -1649,6 +1652,7 @@ define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: scvtf v0.2d, v0.2d
; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: ret
%res = sitofp <1 x i64> %op1 to <1 x float>
ret <1 x float> %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 86494c4be501230..f6dc2f4461e3be7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -180,10 +180,11 @@ define void @concat_v32i16(ptr %a, ptr %b, ptr %c) {
define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) {
; CHECK-LABEL: concat_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: stp w0, w1, [sp, #8]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> <i32 0, i32 1>
ret <2 x i32> %res
@@ -358,10 +359,11 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) {
define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) {
; CHECK-LABEL: concat_v2f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: stp s0, s1, [sp, #8]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> <i32 0, i32 1>
ret <2 x float> %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index 25ecd7a8d7e32ef..456721725deafb3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -125,7 +125,7 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1)
ret <1 x i32> %ret
@@ -224,7 +224,7 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
%ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1)
ret <1 x float> %ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index d6adf9cf0ad6723..7600a8be8e0dc32 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -392,8 +392,9 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) {
define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) {
; CHECK-LABEL: fcvtzu_v1f32_v1i64:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: uunpklo z0.d, z0.s
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
@@ -465,9 +466,7 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
; CHECK-LABEL: fcvtzu_v1f64_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: fcvtzs w8, d0
-; CHECK-NEXT: mov z0.h, w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: fcvtzs w0, d0
; CHECK-NEXT: ret
%res = fptoui <1 x double> %op1 to <1 x i16>
ret <1 x i16> %res
@@ -647,7 +646,7 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = fptoui <1 x double> %op1 to <1 x i32>
ret <1 x i32> %res
@@ -1142,8 +1141,9 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) {
define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) {
; CHECK-LABEL: fcvtzs_v1f32_v1i64:
; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: uunpklo z0.d, z0.s
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
@@ -1217,9 +1217,7 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
; CHECK-LABEL: fcvtzs_v1f64_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: fcvtzs w8, d0
-; CHECK-NEXT: mov z0.h, w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: fcvtzs w0, d0
; CHECK-NEXT: ret
%res = fptosi <1 x double> %op1 to <1 x i16>
ret <1 x i16> %res
@@ -1399,7 +1397,7 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = fptosi <1 x double> %op1 to <1 x i32>
ret <1 x i32> %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index c110e89326cc0cd..c1c143588c0cdd7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -127,9 +127,7 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) {
define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
; CHECK-LABEL: ucvtf_v1i16_v1f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w8, w8, #0xffff
+; CHECK-NEXT: and w8, w0, #0xffff
; CHECK-NEXT: ucvtf d0, w8
; CHECK-NEXT: ret
%res = uitofp <1 x i16> %op1 to <1 x double>