[llvm] [AArch64][GISel] Translate legal SVE formal arguments and select COPY for SVE (PR #95236)

Schrodinger ZHU Yifan via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 14 13:34:47 PDT 2024


https://github.com/SchrodingerZhu updated https://github.com/llvm/llvm-project/pull/95236

>From 1a3292b3fd652da532a581d33af22664ed0ba731 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Mon, 10 Jun 2024 16:26:44 +0100
Subject: [PATCH 1/6] [AArch64][GISel] Translate legal SVE formal arguments and
 select COPY for SVE

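Scalable vector arguments that are legal for SVE (known-minimum size of 128 bits) are
passed in the Z registers, so copies from $z0-$z7 must be constrained to the ZPR class
on the FPR register bank, and the size bookkeeping has to carry a TypeSize rather than
a plain unsigned. As a rough illustration only (not taken from this patch;
LLT::scalable_vector and LLT::scalar are assumed from the current GlobalISel API), the
sizes involved look like:

    // <vscale x 16 x i8>: 16 x 8 = 128 known-minimum bits, scalable.
    LLT NxV16S8 = LLT::scalable_vector(16, LLT::scalar(8));
    TypeSize TS = NxV16S8.getSizeInBits();
    assert(TS.isScalable() && TS.getKnownMinValue() == 128);

The new path is only taken when -aarch64-enable-gisel-sve is set; otherwise the
existing fallback to SelectionDAG for scalable types is kept.
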
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   2 +-
 .../AArch64/GISel/AArch64CallLowering.cpp     |   6 +-
 .../GISel/AArch64InstructionSelector.cpp      |  19 +-
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp |   7 +-
 .../AArch64/GlobalISel/sve-formal-argument.ll |  45 ++
 .../translate-sve-formal-argument.ll          | 389 ++++++++++++++++++
 6 files changed, 456 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument.ll
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 394b741f1c1d0..347c72f76142c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -149,7 +149,7 @@ static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
 // scalable vector types for all instruction, even if SVE is not yet supported
 // with some instructions.
 // See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
-static cl::opt<bool> EnableSVEGISel(
+cl::opt<bool> EnableSVEGISel(
     "aarch64-enable-gisel-sve", cl::Hidden,
     cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
     cl::init(false));
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 322bde3da6763..ba8229168b7ee 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -53,6 +53,8 @@
 using namespace llvm;
 using namespace AArch64GISelUtils;
 
+extern cl::opt<bool> EnableSVEGISel;
+
 AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
   : CallLowering(&TLI) {}
 
@@ -525,10 +527,10 @@ static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder,
 
 bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const {
   auto &F = MF.getFunction();
-  if (F.getReturnType()->isScalableTy() ||
+  if (!EnableSVEGISel && (F.getReturnType()->isScalableTy() ||
       llvm::any_of(F.args(), [](const Argument &A) {
         return A.getType()->isScalableTy();
-      }))
+      })))
     return true;
   const auto &ST = MF.getSubtarget<AArch64Subtarget>();
   if (!ST.hasNEON() || !ST.hasFPARMv8()) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 4a7c82b393c10..a23a31df1356c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -597,8 +597,14 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
 /// Given a register bank, and size in bits, return the smallest register class
 /// that can represent that combination.
 static const TargetRegisterClass *
-getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
+getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
                       bool GetAllRegSet = false) {
+  if (SizeInBits.isScalable()) {
+    assert(RB.getID() == AArch64::FPRRegBankID
+           && "Expected FPR regbank for scalable type size");
+    return &AArch64::ZPRRegClass;
+  }
+  
   unsigned RegBankID = RB.getID();
 
   if (RegBankID == AArch64::GPRRegBankID) {
@@ -939,8 +945,9 @@ getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
   Register SrcReg = I.getOperand(1).getReg();
   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
-  unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
-  unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
+
+  TypeSize DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+  TypeSize SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
 
   // Special casing for cross-bank copies of s1s. We can technically represent
   // a 1-bit value with any size of register. The minimum size for a GPR is 32
@@ -951,7 +958,7 @@ getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
   // register bank. Or make a new helper that carries along some constraint
   // information.
   if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
-    SrcSize = DstSize = 32;
+    SrcSize = DstSize = TypeSize::getFixed(32);
 
   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
           getMinClassForRegBank(DstRegBank, DstSize, true)};
@@ -1016,8 +1023,8 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
       return false;
     }
 
-    unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
-    unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
+    const TypeSize SrcSize = TRI.getRegSizeInBits(*SrcRC);
+    const TypeSize DstSize = TRI.getRegSizeInBits(*DstRC);
     unsigned SubReg;
 
     // If the source bank doesn't support a subregister copy small enough,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index f63e29e442667..fb4676e57d923 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -258,6 +258,7 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   case AArch64::QQQRegClassID:
   case AArch64::QQQQRegClassID:
   case AArch64::ZPRRegClassID:
+  case AArch64::ZPR_3bRegClassID:
     return getRegBank(AArch64::FPRRegBankID);
   case AArch64::GPR32commonRegClassID:
   case AArch64::GPR32RegClassID:
@@ -714,10 +715,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       // If both RB are null that means both registers are generic.
       // We shouldn't be here.
       assert(DstRB && SrcRB && "Both RegBank were nullptr");
-      unsigned Size = getSizeInBits(DstReg, MRI, TRI);
+      TypeSize Size = getSizeInBits(DstReg, MRI, TRI);
       return getInstructionMapping(
-          DefaultMappingID, copyCost(*DstRB, *SrcRB, TypeSize::getFixed(Size)),
-          getCopyMapping(DstRB->getID(), SrcRB->getID(), Size),
+          DefaultMappingID, copyCost(*DstRB, *SrcRB, Size),
+          getCopyMapping(DstRB->getID(), SrcRB->getID(), Size.getKnownMinValue()),
           // We only care about the mapping of the destination.
           /*NumOperands*/ 1);
     }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument.ll b/llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument.ll
new file mode 100644
index 0000000000000..32559f0898ff5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -O0 -mattr=+sve -global-isel -global-isel-abort=1 -aarch64-enable-gisel-sve=1 %s -o - | FileCheck %s
+;; vscale x 128-bit
+
+define void @formal_argument_nxv16i8(<vscale x 16 x i8> %0, ptr %p) {
+; CHECK-LABEL: formal_argument_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+  store <vscale x 16 x i8> %0, ptr %p, align 16
+  ret void
+}
+
+define void @formal_argument_nxv8i16(<vscale x 8 x i16> %0, ptr %p) {
+; CHECK-LABEL: formal_argument_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  store <vscale x 8 x i16> %0, ptr %p, align 16
+  ret void
+}
+
+define void @formal_argument_nxv4i32(<vscale x 4 x i32> %0, ptr %p) {
+; CHECK-LABEL: formal_argument_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  store <vscale x 4 x i32> %0, ptr %p, align 16
+  ret void
+}
+
+define void @formal_argument_nxv2i64(<vscale x 2 x i64> %0, ptr %p) {
+; CHECK-LABEL: formal_argument_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  store <vscale x 2 x i64> %0, ptr %p, align 16
+  ret void
+}
+
+;; TODO: Add tests for other types when store is supported for them.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument.ll b/llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument.ll
new file mode 100644
index 0000000000000..ec89da824779a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument.ll
@@ -0,0 +1,389 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -O0 -mattr=+sve -global-isel -global-isel-abort=1 -aarch64-enable-gisel-sve=1 \
+; RUN:     -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s
+
+;; vscale x 128-bit
+
+define void @formal_argument_nxv16i8(<vscale x 16 x i8> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv16i8
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z0
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv8i16(<vscale x 8 x i16> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv8i16
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv4i32(<vscale x 4 x i32> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv4i32
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv2i64(<vscale x 2 x i64> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv2i64
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv4f32(<vscale x 4 x float> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv4f32
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv2f64(<vscale x 2 x double> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv2f64
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv2p0(<vscale x 2 x ptr> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv2p0
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x p0>) = COPY $z0
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+;; vscale x 256-bit
+
+define void @formal_argument_nxv32i8(<vscale x 32 x i8> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv32i8
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 32 x s8>) = G_CONCAT_VECTORS [[COPY]](<vscale x 16 x s8>), [[COPY1]](<vscale x 16 x s8>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv16i16(<vscale x 16 x i16> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv16i16
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 16 x s16>) = G_CONCAT_VECTORS [[COPY]](<vscale x 8 x s16>), [[COPY1]](<vscale x 8 x s16>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv8i32(<vscale x 8 x i32> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv8i32
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 8 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv4i64(<vscale x 4 x i64> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv4i64
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 4 x s64>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv8f32(<vscale x 8 x float> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv8f32
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 8 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv4f64(<vscale x 4 x double> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv4f64
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 4 x s64>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv4p0(<vscale x 4 x ptr> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv4p0
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 4 x p0>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+;; vscale x 512-bit
+
+define void @formal_argument_nxv64i8(<vscale x 64 x i8> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv64i8
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 64 x s8>) = G_CONCAT_VECTORS [[COPY]](<vscale x 16 x s8>), [[COPY1]](<vscale x 16 x s8>), [[COPY2]](<vscale x 16 x s8>), [[COPY3]](<vscale x 16 x s8>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv32i16(<vscale x 32 x i16> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv32i16
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 32 x s16>) = G_CONCAT_VECTORS [[COPY]](<vscale x 8 x s16>), [[COPY1]](<vscale x 8 x s16>), [[COPY2]](<vscale x 8 x s16>), [[COPY3]](<vscale x 8 x s16>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv16i32(<vscale x 16 x i32> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv16i32
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 16 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>), [[COPY2]](<vscale x 4 x s32>), [[COPY3]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv8i64(<vscale x 8 x i64> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv8i64
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 8 x s64>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>), [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv16f32(<vscale x 16 x float> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv16f32
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 16 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>), [[COPY2]](<vscale x 4 x s32>), [[COPY3]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv8f64(<vscale x 8 x double> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv8f64
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 8 x s64>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>), [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv8p0(<vscale x 8 x ptr> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv8p0
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 8 x p0>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>), [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+;; vscale x 1024-bit
+
+define void @formal_argument_nxv128i8(<vscale x 128 x i8> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv128i8
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 128 x s8>) = G_CONCAT_VECTORS [[COPY]](<vscale x 16 x s8>), [[COPY1]](<vscale x 16 x s8>), [[COPY2]](<vscale x 16 x s8>), [[COPY3]](<vscale x 16 x s8>), [[COPY4]](<vscale x 16 x s8>), [[COPY5]](<vscale x 16 x s8>), [[COPY6]](<vscale x 16 x s8>), [[COPY7]](<vscale x 16 x s8>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv64i16(<vscale x 64 x i16> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv64i16
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 64 x s16>) = G_CONCAT_VECTORS [[COPY]](<vscale x 8 x s16>), [[COPY1]](<vscale x 8 x s16>), [[COPY2]](<vscale x 8 x s16>), [[COPY3]](<vscale x 8 x s16>), [[COPY4]](<vscale x 8 x s16>), [[COPY5]](<vscale x 8 x s16>), [[COPY6]](<vscale x 8 x s16>), [[COPY7]](<vscale x 8 x s16>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv32i32(<vscale x 32 x i32> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv32i32
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 32 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>), [[COPY2]](<vscale x 4 x s32>), [[COPY3]](<vscale x 4 x s32>), [[COPY4]](<vscale x 4 x s32>), [[COPY5]](<vscale x 4 x s32>), [[COPY6]](<vscale x 4 x s32>), [[COPY7]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv16i64(<vscale x 16 x i64> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv16i64
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 16 x s64>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>), [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>), [[COPY4]](<vscale x 2 x s64>), [[COPY5]](<vscale x 2 x s64>), [[COPY6]](<vscale x 2 x s64>), [[COPY7]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv32f32(<vscale x 32 x float> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv32f32
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 32 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>), [[COPY2]](<vscale x 4 x s32>), [[COPY3]](<vscale x 4 x s32>), [[COPY4]](<vscale x 4 x s32>), [[COPY5]](<vscale x 4 x s32>), [[COPY6]](<vscale x 4 x s32>), [[COPY7]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv16f64(<vscale x 16 x double> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv16f64
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 16 x s64>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>), [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>), [[COPY4]](<vscale x 2 x s64>), [[COPY5]](<vscale x 2 x s64>), [[COPY6]](<vscale x 2 x s64>), [[COPY7]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv16p0(<vscale x 16 x ptr> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv16p0
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 16 x p0>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>), [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>), [[COPY4]](<vscale x 2 x s64>), [[COPY5]](<vscale x 2 x s64>), [[COPY6]](<vscale x 2 x s64>), [[COPY7]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}

>From c960a6ea80f6d97bcc4cbbd8b3828060d4289282 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 12 Jun 2024 13:37:50 +0100
Subject: [PATCH 2/6] Reformat code

---
 llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp       | 6 +++---
 .../lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 6 +++---
 llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp   | 3 ++-
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index ba8229168b7ee..a93e74f120abf 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -528,9 +528,9 @@ static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder,
 bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const {
   auto &F = MF.getFunction();
   if (!EnableSVEGISel && (F.getReturnType()->isScalableTy() ||
-      llvm::any_of(F.args(), [](const Argument &A) {
-        return A.getType()->isScalableTy();
-      })))
+                          llvm::any_of(F.args(), [](const Argument &A) {
+                            return A.getType()->isScalableTy();
+                          })))
     return true;
   const auto &ST = MF.getSubtarget<AArch64Subtarget>();
   if (!ST.hasNEON() || !ST.hasFPARMv8()) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index a23a31df1356c..d32007ec45fb6 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -600,11 +600,11 @@ static const TargetRegisterClass *
 getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
                       bool GetAllRegSet = false) {
   if (SizeInBits.isScalable()) {
-    assert(RB.getID() == AArch64::FPRRegBankID
-           && "Expected FPR regbank for scalable type size");
+    assert(RB.getID() == AArch64::FPRRegBankID &&
+           "Expected FPR regbank for scalable type size");
     return &AArch64::ZPRRegClass;
   }
-  
+
   unsigned RegBankID = RB.getID();
 
   if (RegBankID == AArch64::GPRRegBankID) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index fb4676e57d923..8907add14d1ff 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -718,7 +718,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       TypeSize Size = getSizeInBits(DstReg, MRI, TRI);
       return getInstructionMapping(
           DefaultMappingID, copyCost(*DstRB, *SrcRB, Size),
-          getCopyMapping(DstRB->getID(), SrcRB->getID(), Size.getKnownMinValue()),
+          getCopyMapping(DstRB->getID(), SrcRB->getID(),
+                         Size.getKnownMinValue()),
           // We only care about the mapping of the destination.
           /*NumOperands*/ 1);
     }

>From 02a9229916a456f3b63424289b25f8429aeb6402 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 12 Jun 2024 14:40:06 +0100
Subject: [PATCH 3/6] Change getCopyMapping to take a TypeSize

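getCopyMapping and getValueMapping previously took a plain unsigned bit width, which
cannot distinguish a fixed 128-bit value from a scalable (vscale x 128-bit) one. A
minimal sketch of the distinction, for reference only (TypeSize::getScalable is part of
the LLVM Support API but is not used in this diff):

    TypeSize Fixed = TypeSize::getFixed(64);      // exactly 64 bits
    TypeSize Scal  = TypeSize::getScalable(128);  // vscale x 128 bits
    assert(!Fixed.isScalable() && Scal.isScalable());
    assert(Scal.getKnownMinValue() == 128);       // minimum; runtime size is a multiple

Callers that know the size is fixed wrap it with TypeSize::getFixed, as done below.
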
---
 .../AArch64/AArch64GenRegisterBankInfo.def    | 26 ++++++----
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 47 ++++++++++---------
 .../AArch64/GISel/AArch64RegisterBankInfo.h   |  6 +--
 3 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index b87421e5ee46a..d243ca51cdcb9 100644
--- a/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -137,7 +137,9 @@ bool AArch64GenRegisterBankInfo::checkValueMapImpl(unsigned Idx,
                                                    unsigned Offset) {
   unsigned PartialMapBaseIdx = Idx - PartialMappingIdx::PMI_Min;
   const ValueMapping &Map =
-      AArch64GenRegisterBankInfo::getValueMapping((PartialMappingIdx)FirstInBank, Size)[Offset];
+      AArch64GenRegisterBankInfo::getValueMapping(
+        (PartialMappingIdx)FirstInBank,
+        TypeSize::getFixed(Size))[Offset];
   return Map.BreakDown == &PartMappings[PartialMapBaseIdx] &&
          Map.NumBreakDowns == 1;
 }
@@ -167,7 +169,7 @@ bool AArch64GenRegisterBankInfo::checkPartialMappingIdx(
 }
 
 unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
-                                                             unsigned Size) {
+                                                             TypeSize Size) {
   if (RBIdx == PMI_FirstGPR) {
     if (Size <= 32)
       return 0;
@@ -178,17 +180,20 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
     return -1;
   }
   if (RBIdx == PMI_FirstFPR) {
-    if (Size <= 16)
+    const unsigned MinSize = Size.getKnownMinValue();
+    assert(!Size.isScalable() || MinSize >= 128
+           && "Scalable vector types should have size of at least 128 bits");
+    if (MinSize <= 16)
       return 0;
-    if (Size <= 32)
+    if (MinSize <= 32)
       return 1;
-    if (Size <= 64)
+    if (MinSize <= 64)
       return 2;
-    if (Size <= 128)
+    if (MinSize <= 128)
       return 3;
-    if (Size <= 256)
+    if (MinSize <= 256)
       return 4;
-    if (Size <= 512)
+    if (MinSize <= 512)
       return 5;
     return -1;
   }
@@ -197,7 +202,7 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
 
 const RegisterBankInfo::ValueMapping *
 AArch64GenRegisterBankInfo::getValueMapping(PartialMappingIdx RBIdx,
-                                            unsigned Size) {
+                                            const TypeSize Size) {
   assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that");
   unsigned BaseIdxOffset = getRegBankBaseIdxOffset(RBIdx, Size);
   if (BaseIdxOffset == -1u)
@@ -221,7 +226,8 @@ const AArch64GenRegisterBankInfo::PartialMappingIdx
 
 const RegisterBankInfo::ValueMapping *
 AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID,
-                                           unsigned SrcBankID, unsigned Size) {
+                                           unsigned SrcBankID,
+                                           const TypeSize Size) {
   assert(DstBankID < AArch64::NumRegisterBanks && "Invalid bank ID");
   assert(SrcBankID < AArch64::NumRegisterBanks && "Invalid bank ID");
   PartialMappingIdx DstRBIdx = BankIDToCopyMapIdx[DstBankID];
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 8907add14d1ff..5616d063f70bc 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -163,17 +163,18 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(
     unsigned PartialMapSrcIdx = PMI_##RBNameSrc##Size - PMI_Min;               \
     (void)PartialMapDstIdx;                                                    \
     (void)PartialMapSrcIdx;                                                    \
-    const ValueMapping *Map = getCopyMapping(                                  \
-        AArch64::RBNameDst##RegBankID, AArch64::RBNameSrc##RegBankID, Size);  \
+    const ValueMapping *Map = getCopyMapping(AArch64::RBNameDst##RegBankID,    \
+                                             AArch64::RBNameSrc##RegBankID,    \
+                                             TypeSize::getFixed(Size));        \
     (void)Map;                                                                 \
     assert(Map[0].BreakDown ==                                                 \
                &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] &&  \
-           Map[0].NumBreakDowns == 1 && #RBNameDst #Size                       \
-           " Dst is incorrectly initialized");                                 \
+           Map[0].NumBreakDowns == 1 &&                                        \
+           #RBNameDst #Size " Dst is incorrectly initialized");                \
     assert(Map[1].BreakDown ==                                                 \
                &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] &&  \
-           Map[1].NumBreakDowns == 1 && #RBNameSrc #Size                       \
-           " Src is incorrectly initialized");                                 \
+           Map[1].NumBreakDowns == 1 &&                                        \
+           #RBNameSrc #Size " Src is incorrectly initialized");                \
                                                                                \
   } while (false)
 
@@ -218,7 +219,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(
 
 unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
                                            const RegisterBank &B,
-                                           TypeSize Size) const {
+                                           const TypeSize Size) const {
   // What do we do with different size?
   // copy are same size.
   // Will introduce other hooks for different size:
@@ -305,7 +306,7 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
   case TargetOpcode::G_OR: {
     // 32 and 64-bit or can be mapped on either FPR or
     // GPR for the same cost.
-    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+    TypeSize Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
     if (Size != 32 && Size != 64)
       break;
 
@@ -326,7 +327,7 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
     return AltMappings;
   }
   case TargetOpcode::G_BITCAST: {
-    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+    TypeSize Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
     if (Size != 32 && Size != 64)
       break;
 
@@ -366,7 +367,7 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
     return AltMappings;
   }
   case TargetOpcode::G_LOAD: {
-    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+    TypeSize Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
     if (Size != 64)
       break;
 
@@ -378,15 +379,17 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
     InstructionMappings AltMappings;
     const InstructionMapping &GPRMapping = getInstructionMapping(
         /*ID*/ 1, /*Cost*/ 1,
-        getOperandsMapping({getValueMapping(PMI_FirstGPR, Size),
-                            // Addresses are GPR 64-bit.
-                            getValueMapping(PMI_FirstGPR, 64)}),
+        getOperandsMapping(
+            {getValueMapping(PMI_FirstGPR, Size),
+             // Addresses are GPR 64-bit.
+             getValueMapping(PMI_FirstGPR, TypeSize::getFixed(64))}),
         /*NumOperands*/ 2);
     const InstructionMapping &FPRMapping = getInstructionMapping(
         /*ID*/ 2, /*Cost*/ 1,
-        getOperandsMapping({getValueMapping(PMI_FirstFPR, Size),
-                            // Addresses are GPR 64-bit.
-                            getValueMapping(PMI_FirstGPR, 64)}),
+        getOperandsMapping(
+            {getValueMapping(PMI_FirstFPR, Size),
+             // Addresses are GPR 64-bit.
+             getValueMapping(PMI_FirstGPR, TypeSize::getFixed(64))}),
         /*NumOperands*/ 2);
 
     AltMappings.push_back(&GPRMapping);
@@ -438,7 +441,7 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(
          "This code is for instructions with 3 or less operands");
 
   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
-  unsigned Size = Ty.getSizeInBits();
+  TypeSize Size = Ty.getSizeInBits();
   bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
 
   PartialMappingIdx RBIdx = IsFPR ? PMI_FirstFPR : PMI_FirstGPR;
@@ -718,8 +721,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       TypeSize Size = getSizeInBits(DstReg, MRI, TRI);
       return getInstructionMapping(
           DefaultMappingID, copyCost(*DstRB, *SrcRB, Size),
-          getCopyMapping(DstRB->getID(), SrcRB->getID(),
-                         Size.getKnownMinValue()),
+          getCopyMapping(DstRB->getID(), SrcRB->getID(), Size),
           // We only care about the mapping of the destination.
           /*NumOperands*/ 1);
     }
@@ -729,7 +731,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case TargetOpcode::G_BITCAST: {
     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
     LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
-    unsigned Size = DstTy.getSizeInBits();
+    TypeSize Size = DstTy.getSizeInBits();
     bool DstIsGPR = !DstTy.isVector() && DstTy.getSizeInBits() <= 64;
     bool SrcIsGPR = !SrcTy.isVector() && SrcTy.getSizeInBits() <= 64;
     const RegisterBank &DstRB =
@@ -737,7 +739,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     const RegisterBank &SrcRB =
         SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
     return getInstructionMapping(
-        DefaultMappingID, copyCost(DstRB, SrcRB, TypeSize::getFixed(Size)),
+        DefaultMappingID, copyCost(DstRB, SrcRB, Size),
         getCopyMapping(DstRB.getID(), SrcRB.getID(), Size),
         // We only care about the mapping of the destination for COPY.
         /*NumOperands*/ Opc == TargetOpcode::G_BITCAST ? 2 : 1);
@@ -1128,7 +1130,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       LLT Ty = MRI.getType(MI.getOperand(Idx).getReg());
       if (!Ty.isValid())
         continue;
-      auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+      auto Mapping =
+          getValueMapping(OpRegBankIdx[Idx], TypeSize::getFixed(OpSize[Idx]));
       if (!Mapping->isValid())
         return getInvalidInstructionMapping();
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
index 9d23f8480f779..0d89f540650a9 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
@@ -70,7 +70,7 @@ class AArch64GenRegisterBankInfo : public RegisterBankInfo {
                                      PartialMappingIdx LastAlias,
                                      ArrayRef<PartialMappingIdx> Order);
 
-  static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, unsigned Size);
+  static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, TypeSize Size);
 
   /// Get the pointer to the ValueMapping representing the RegisterBank
   /// at \p RBIdx with a size of \p Size.
@@ -80,13 +80,13 @@ class AArch64GenRegisterBankInfo : public RegisterBankInfo {
   ///
   /// \pre \p RBIdx != PartialMappingIdx::None
   static const RegisterBankInfo::ValueMapping *
-  getValueMapping(PartialMappingIdx RBIdx, unsigned Size);
+  getValueMapping(PartialMappingIdx RBIdx, TypeSize Size);
 
   /// Get the pointer to the ValueMapping of the operands of a copy
   /// instruction from the \p SrcBankID register bank to the \p DstBankID
   /// register bank with a size of \p Size.
   static const RegisterBankInfo::ValueMapping *
-  getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size);
+  getCopyMapping(unsigned DstBankID, unsigned SrcBankID, TypeSize Size);
 
   /// Get the instruction mapping for G_FPEXT.
   ///

>From 52ddee622078dd34a31428410a402408458fa176 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Thu, 13 Jun 2024 12:56:05 +0100
Subject: [PATCH 4/6] Add tests for multiple SVE legal arguments

---
 .../sve-formal-argument-multiple.ll           |  44 ++
 .../translate-sve-formal-argument-multiple.ll | 436 ++++++++++++++++++
 2 files changed, 480 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument-multiple.ll
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument-multiple.ll

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument-multiple.ll b/llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument-multiple.ll
new file mode 100644
index 0000000000000..621f6208ccb14
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument-multiple.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -global-isel -global-isel-abort=1 -aarch64-enable-gisel-sve=1 %s -o - | FileCheck %s
+
+;; Test the correct usage of the Z registers with multiple SVE arguments.
+
+define void @formal_argument_nxv16i8_2(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, ptr %p) {
+; CHECK-LABEL: formal_argument_nxv16i8_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    st1b { z1.b }, p0, [x0]
+; CHECK-NEXT:    ret
+  store <vscale x 16 x i8> %0, ptr %p, align 16
+  store <vscale x 16 x i8> %1, ptr %p, align 16
+  ret void
+}
+
+define void @formal_argument_nxv16i8_8(
+; CHECK-LABEL: formal_argument_nxv16i8_8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    st1b { z1.b }, p0, [x0]
+; CHECK-NEXT:    st1b { z2.b }, p0, [x0]
+; CHECK-NEXT:    st1b { z3.b }, p0, [x0]
+; CHECK-NEXT:    st1b { z4.b }, p0, [x0]
+; CHECK-NEXT:    st1b { z5.b }, p0, [x0]
+; CHECK-NEXT:    st1b { z6.b }, p0, [x0]
+; CHECK-NEXT:    st1b { z7.b }, p0, [x0]
+; CHECK-NEXT:    ret
+    <vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3,
+    <vscale x 16 x i8> %4, <vscale x 16 x i8> %5, <vscale x 16 x i8> %6, <vscale x 16 x i8> %7,
+    ptr %p) {
+
+  store <vscale x 16 x i8> %0, ptr %p, align 16
+  store <vscale x 16 x i8> %1, ptr %p, align 16
+  store <vscale x 16 x i8> %2, ptr %p, align 16
+  store <vscale x 16 x i8> %3, ptr %p, align 16
+  store <vscale x 16 x i8> %4, ptr %p, align 16
+  store <vscale x 16 x i8> %5, ptr %p, align 16
+  store <vscale x 16 x i8> %6, ptr %p, align 16
+  store <vscale x 16 x i8> %7, ptr %p, align 16
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument-multiple.ll b/llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument-multiple.ll
new file mode 100644
index 0000000000000..c54000f4a43c0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument-multiple.ll
@@ -0,0 +1,436 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -O0 -mattr=+sve -global-isel -global-isel-abort=1 -aarch64-enable-gisel-sve=1 \
+; RUN:     -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s
+
+;; Test the correct usage of the Z registers with multiple SVE arguments.
+
+;; Mixing SVE types
+
+define void @formal_argument_mix_sve(
+  ; CHECK-LABEL: name: formal_argument_mix_sve
+  ; CHECK: bb.1 (%ir-block.4):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 8 x s32>) = G_CONCAT_VECTORS [[COPY1]](<vscale x 4 x s32>), [[COPY2]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 8 x p0>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 2 x s64>), [[COPY5]](<vscale x 2 x s64>), [[COPY6]](<vscale x 2 x s64>), [[COPY7]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+    <vscale x 8 x i16> %0, <vscale x 8 x float> %1, <vscale x 16 x i8> %2, <vscale x 8 x ptr> %3
+) {
+  ret void
+}
+
+;; Mixing SVE and non-SVE types
+
+define void @formal_argument_mix_sve_double(
+  ; CHECK-LABEL: name: formal_argument_mix_sve_double
+  ; CHECK: bb.1 (%ir-block.2):
+  ; CHECK-NEXT:   liveins: $d1, $z0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $d1
+  ; CHECK-NEXT:   RET_ReallyLR
+    <vscale x 8 x i16> %0, double %1
+) {
+  ret void
+}
+
+define void @formal_argument_mix_sve_int_double(
+  ; CHECK-LABEL: name: formal_argument_mix_sve_int_double
+  ; CHECK: bb.1 (%ir-block.4):
+  ; CHECK-NEXT:   liveins: $d0, $w0, $z1, $z2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $w0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z2
+  ; CHECK-NEXT:   RET_ReallyLR
+    double %0, <vscale x 8 x i16> %1, i32 %2,  <vscale x 8 x i16> %3
+) {
+  ret void
+}
+
+;; 1024-bit cases which fit into Z0-Z7
+;; TODO: Add tests for arguments that do not fit into Z0-Z7, when we support them. (They should be passed as memory addresses)
+
+;; nxv4___ 1024-bit
+define void @formal_argument_nxv4i64_4(
+  ; CHECK-LABEL: name: formal_argument_nxv4i64_4
+  ; CHECK: bb.1 (%ir-block.8):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z7
+  ; CHECK-NEXT:   RET_ReallyLR
+    <vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3,
+    <vscale x 8 x i16> %4, <vscale x 8 x i16> %5, <vscale x 8 x i16> %6, <vscale x 8 x i16> %7
+) {
+  ret void
+}
+
+;; nxv8___ 1024-bit
+define void @formal_argument_nxv8i16_8(
+  ; CHECK-LABEL: name: formal_argument_nxv8i16_8
+  ; CHECK: bb.1 (%ir-block.8):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z7
+  ; CHECK-NEXT:   RET_ReallyLR
+    <vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3,
+    <vscale x 8 x i16> %4, <vscale x 8 x i16> %5, <vscale x 8 x i16> %6, <vscale x 8 x i16> %7
+) {
+  ret void
+}
+
+define void @formal_argument_nxv8i32_4(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3) {
+  ; CHECK-LABEL: name: formal_argument_nxv8i32_4
+  ; CHECK: bb.1 (%ir-block.4):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 8 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_CONCAT_VECTORS [[COPY2]](<vscale x 4 x s32>), [[COPY3]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z5
+  ; CHECK-NEXT:   [[CONCAT_VECTORS2:%[0-9]+]]:_(<vscale x 8 x s32>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 4 x s32>), [[COPY5]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS3:%[0-9]+]]:_(<vscale x 8 x s32>) = G_CONCAT_VECTORS [[COPY6]](<vscale x 4 x s32>), [[COPY7]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv8f32_4(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3) {
+  ; CHECK-LABEL: name: formal_argument_nxv8f32_4
+  ; CHECK: bb.1 (%ir-block.4):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 8 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_CONCAT_VECTORS [[COPY2]](<vscale x 4 x s32>), [[COPY3]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z5
+  ; CHECK-NEXT:   [[CONCAT_VECTORS2:%[0-9]+]]:_(<vscale x 8 x s32>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 4 x s32>), [[COPY5]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS3:%[0-9]+]]:_(<vscale x 8 x s32>) = G_CONCAT_VECTORS [[COPY6]](<vscale x 4 x s32>), [[COPY7]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv8i64_2(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1) {
+  ; CHECK-LABEL: name: formal_argument_nxv8i64_2
+  ; CHECK: bb.1 (%ir-block.2):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 8 x s64>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>), [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 2 x s64>), [[COPY5]](<vscale x 2 x s64>), [[COPY6]](<vscale x 2 x s64>), [[COPY7]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv8f64_2(<vscale x 8 x double> %0, <vscale x 8 x double> %1) {
+  ; CHECK-LABEL: name: formal_argument_nxv8f64_2
+  ; CHECK: bb.1 (%ir-block.2):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 8 x s64>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>), [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 2 x s64>), [[COPY5]](<vscale x 2 x s64>), [[COPY6]](<vscale x 2 x s64>), [[COPY7]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv8p0_2(<vscale x 8 x ptr> %0, <vscale x 8 x ptr> %1) {
+  ; CHECK-LABEL: name: formal_argument_nxv8p0_2
+  ; CHECK: bb.1 (%ir-block.2):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 8 x p0>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>), [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 8 x p0>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 2 x s64>), [[COPY5]](<vscale x 2 x s64>), [[COPY6]](<vscale x 2 x s64>), [[COPY7]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+;; nxv16___ 1024-bit
+
+define void @formal_argument_nxv16i8_8(
+  ; CHECK-LABEL: name: formal_argument_nxv16i8_8
+  ; CHECK: bb.1 (%ir-block.8):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z7
+  ; CHECK-NEXT:   RET_ReallyLR
+    <vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3,
+    <vscale x 16 x i8> %4, <vscale x 16 x i8> %5, <vscale x 16 x i8> %6, <vscale x 16 x i8> %7
+) {
+  ret void
+}
+
+define void @formal_argument_nxv16i16_4(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3) {
+  ; CHECK-LABEL: name: formal_argument_nxv16i16_4
+  ; CHECK: bb.1 (%ir-block.4):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 16 x s16>) = G_CONCAT_VECTORS [[COPY]](<vscale x 8 x s16>), [[COPY1]](<vscale x 8 x s16>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_CONCAT_VECTORS [[COPY2]](<vscale x 8 x s16>), [[COPY3]](<vscale x 8 x s16>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z5
+  ; CHECK-NEXT:   [[CONCAT_VECTORS2:%[0-9]+]]:_(<vscale x 16 x s16>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 8 x s16>), [[COPY5]](<vscale x 8 x s16>)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS3:%[0-9]+]]:_(<vscale x 16 x s16>) = G_CONCAT_VECTORS [[COPY6]](<vscale x 8 x s16>), [[COPY7]](<vscale x 8 x s16>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv16i32_2(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1) {
+  ; CHECK-LABEL: name: formal_argument_nxv16i32_2
+  ; CHECK: bb.1 (%ir-block.2):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 16 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>), [[COPY2]](<vscale x 4 x s32>), [[COPY3]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 4 x s32>), [[COPY5]](<vscale x 4 x s32>), [[COPY6]](<vscale x 4 x s32>), [[COPY7]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv16f32_2(<vscale x 16 x float> %0, <vscale x 16 x float> %1) {
+  ; CHECK-LABEL: name: formal_argument_nxv16f32_2
+  ; CHECK: bb.1 (%ir-block.2):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 16 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>), [[COPY2]](<vscale x 4 x s32>), [[COPY3]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 4 x s32>), [[COPY5]](<vscale x 4 x s32>), [[COPY6]](<vscale x 4 x s32>), [[COPY7]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv16i64_1(<vscale x 16 x i64> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv16i64_1
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 16 x s64>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>), [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>), [[COPY4]](<vscale x 2 x s64>), [[COPY5]](<vscale x 2 x s64>), [[COPY6]](<vscale x 2 x s64>), [[COPY7]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv16p0_1(<vscale x 16 x ptr> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv16p0_1
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 16 x p0>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>), [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>), [[COPY4]](<vscale x 2 x s64>), [[COPY5]](<vscale x 2 x s64>), [[COPY6]](<vscale x 2 x s64>), [[COPY7]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+;; nxv32___ 1024-bit
+
+define void @formal_argument_nxv32i8_4(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3) {
+  ; CHECK-LABEL: name: formal_argument_nxv32i8_4
+  ; CHECK: bb.1 (%ir-block.4):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 32 x s8>) = G_CONCAT_VECTORS [[COPY]](<vscale x 16 x s8>), [[COPY1]](<vscale x 16 x s8>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_CONCAT_VECTORS [[COPY2]](<vscale x 16 x s8>), [[COPY3]](<vscale x 16 x s8>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z5
+  ; CHECK-NEXT:   [[CONCAT_VECTORS2:%[0-9]+]]:_(<vscale x 32 x s8>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 16 x s8>), [[COPY5]](<vscale x 16 x s8>)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS3:%[0-9]+]]:_(<vscale x 32 x s8>) = G_CONCAT_VECTORS [[COPY6]](<vscale x 16 x s8>), [[COPY7]](<vscale x 16 x s8>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv32i16_2(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1) {
+  ; CHECK-LABEL: name: formal_argument_nxv32i16_2
+  ; CHECK: bb.1 (%ir-block.2):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 32 x s16>) = G_CONCAT_VECTORS [[COPY]](<vscale x 8 x s16>), [[COPY1]](<vscale x 8 x s16>), [[COPY2]](<vscale x 8 x s16>), [[COPY3]](<vscale x 8 x s16>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 8 x s16>), [[COPY5]](<vscale x 8 x s16>), [[COPY6]](<vscale x 8 x s16>), [[COPY7]](<vscale x 8 x s16>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv32i32_1(<vscale x 32 x i32> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv32i32_1
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 32 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>), [[COPY2]](<vscale x 4 x s32>), [[COPY3]](<vscale x 4 x s32>), [[COPY4]](<vscale x 4 x s32>), [[COPY5]](<vscale x 4 x s32>), [[COPY6]](<vscale x 4 x s32>), [[COPY7]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv32f32_1(<vscale x 32 x float> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv32f32_1
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 32 x s32>) = G_CONCAT_VECTORS [[COPY]](<vscale x 4 x s32>), [[COPY1]](<vscale x 4 x s32>), [[COPY2]](<vscale x 4 x s32>), [[COPY3]](<vscale x 4 x s32>), [[COPY4]](<vscale x 4 x s32>), [[COPY5]](<vscale x 4 x s32>), [[COPY6]](<vscale x 4 x s32>), [[COPY7]](<vscale x 4 x s32>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+;; nxv64___ 1024-bit
+
+define void @formal_argument_nxv64i8_2(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1) {
+  ; CHECK-LABEL: name: formal_argument_nxv64i8_2
+  ; CHECK: bb.1 (%ir-block.2):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 64 x s8>) = G_CONCAT_VECTORS [[COPY]](<vscale x 16 x s8>), [[COPY1]](<vscale x 16 x s8>), [[COPY2]](<vscale x 16 x s8>), [[COPY3]](<vscale x 16 x s8>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 16 x s8>), [[COPY5]](<vscale x 16 x s8>), [[COPY6]](<vscale x 16 x s8>), [[COPY7]](<vscale x 16 x s8>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv64i16_1(<vscale x 64 x i16> %0) {
+  ; CHECK-LABEL: name: formal_argument_nxv64i16_1
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 64 x s16>) = G_CONCAT_VECTORS [[COPY]](<vscale x 8 x s16>), [[COPY1]](<vscale x 8 x s16>), [[COPY2]](<vscale x 8 x s16>), [[COPY3]](<vscale x 8 x s16>), [[COPY4]](<vscale x 8 x s16>), [[COPY5]](<vscale x 8 x s16>), [[COPY6]](<vscale x 8 x s16>), [[COPY7]](<vscale x 8 x s16>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}

>From 3cf1155b9552e405d387a49a122a230d5f4a8963 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Thu, 13 Jun 2024 13:47:22 +0100
Subject: [PATCH 5/6] Remove `-O0`

---
 llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument.ll b/llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument.ll
index 32559f0898ff5..aa9a671087f41 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/sve-formal-argument.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-linux-gnu -O0 -mattr=+sve -global-isel -global-isel-abort=1 -aarch64-enable-gisel-sve=1 %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -global-isel -global-isel-abort=1 -aarch64-enable-gisel-sve=1 %s -o - | FileCheck %s
 ;; vscale x 128-bit
 
 define void @formal_argument_nxv16i8(<vscale x 16 x i8> %0, ptr %p) {

>From 14736f5841fb508717c7af5a2c1cb18ba0de63a0 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Thu, 13 Jun 2024 14:13:36 +0100
Subject: [PATCH 6/6] Add more tests

---
 .../translate-sve-formal-argument-multiple.ll | 147 ++++++++++++++++--
 1 file changed, 135 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument-multiple.ll b/llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument-multiple.ll
index c54000f4a43c0..28d53dab9d99f 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument-multiple.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/translate-sve-formal-argument-multiple.ll
@@ -60,27 +60,150 @@ define void @formal_argument_mix_sve_int_double(
 ;; 1024-bit cases which fit into Z0-Z7
 ;; TODO: Add tests for arguments that do not fit into Z0-Z7, when we support them. (They should be passed as memory addresses)
 
+;; nxv2___ 1024-bit
+define void @formal_argument_nxv2i64_8(
+  ; CHECK-LABEL: name: formal_argument_nxv2i64_8
+  ; CHECK: bb.1 (%ir-block.8):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   RET_ReallyLR
+    <vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3,
+    <vscale x 2 x i64> %4, <vscale x 2 x i64> %5, <vscale x 2 x i64> %6, <vscale x 2 x i64> %7
+) {
+  ret void
+}
+
+define void @formal_argument_nxv2f64_8(
+  ; CHECK-LABEL: name: formal_argument_nxv2f64_8
+  ; CHECK: bb.1 (%ir-block.8):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   RET_ReallyLR
+    <vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3,
+    <vscale x 2 x double> %4, <vscale x 2 x double> %5, <vscale x 2 x double> %6, <vscale x 2 x double> %7
+) {
+  ret void
+}
+
+define void @formal_argument_nxv2p0_8(
+  ; CHECK-LABEL: name: formal_argument_nxv2p0_8
+  ; CHECK: bb.1 (%ir-block.8):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x p0>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x p0>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x p0>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x p0>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x p0>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x p0>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x p0>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x p0>) = COPY $z7
+  ; CHECK-NEXT:   RET_ReallyLR
+    <vscale x 2 x ptr> %0, <vscale x 2 x ptr> %1, <vscale x 2 x ptr> %2, <vscale x 2 x ptr> %3,
+    <vscale x 2 x ptr> %4, <vscale x 2 x ptr> %5, <vscale x 2 x ptr> %6, <vscale x 2 x ptr> %7
+) {
+  ret void
+}
+
 ;; nxv4___ 1024-bit
-define void @formal_argument_nxv4i64_4(
-  ; CHECK-LABEL: name: formal_argument_nxv4i64_4
+define void @formal_argument_nxv4i32_8(
+  ; CHECK-LABEL: name: formal_argument_nxv4i32_8
   ; CHECK: bb.1 (%ir-block.8):
   ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z1
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z2
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z3
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z4
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z5
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z6
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $z7
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z7
   ; CHECK-NEXT:   RET_ReallyLR
-    <vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3,
-    <vscale x 8 x i16> %4, <vscale x 8 x i16> %5, <vscale x 8 x i16> %6, <vscale x 8 x i16> %7
+    <vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3,
+    <vscale x 4 x i32> %4, <vscale x 4 x i32> %5, <vscale x 4 x i32> %6, <vscale x 4 x i32> %7
 ) {
   ret void
 }
 
+define void @formal_argument_nxv4f32_8(
+  ; CHECK-LABEL: name: formal_argument_nxv4f32_8
+  ; CHECK: bb.1 (%ir-block.8):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $z7
+  ; CHECK-NEXT:   RET_ReallyLR
+    <vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3,
+    <vscale x 4 x float> %4, <vscale x 4 x float> %5, <vscale x 4 x float> %6, <vscale x 4 x float> %7
+) {
+  ret void
+}
+
+define void @formal_argument_nxv4i64_4(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3) {
+  ; CHECK-LABEL: name: formal_argument_nxv4i64_4
+  ; CHECK: bb.1 (%ir-block.4):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 4 x s64>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_CONCAT_VECTORS [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[CONCAT_VECTORS2:%[0-9]+]]:_(<vscale x 4 x s64>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 2 x s64>), [[COPY5]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS3:%[0-9]+]]:_(<vscale x 4 x s64>) = G_CONCAT_VECTORS [[COPY6]](<vscale x 2 x s64>), [[COPY7]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
+define void @formal_argument_nxv4p0_4(<vscale x 4 x ptr> %0, <vscale x 4 x ptr> %1, <vscale x 4 x ptr> %2, <vscale x 4 x ptr> %3) {
+  ; CHECK-LABEL: name: formal_argument_nxv4p0_4
+  ; CHECK: bb.1 (%ir-block.4):
+  ; CHECK-NEXT:   liveins: $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z1
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<vscale x 4 x p0>) = G_CONCAT_VECTORS [[COPY]](<vscale x 2 x s64>), [[COPY1]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z3
+  ; CHECK-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<vscale x 4 x p0>) = G_CONCAT_VECTORS [[COPY2]](<vscale x 2 x s64>), [[COPY3]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z5
+  ; CHECK-NEXT:   [[CONCAT_VECTORS2:%[0-9]+]]:_(<vscale x 4 x p0>) = G_CONCAT_VECTORS [[COPY4]](<vscale x 2 x s64>), [[COPY5]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z6
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $z7
+  ; CHECK-NEXT:   [[CONCAT_VECTORS3:%[0-9]+]]:_(<vscale x 4 x p0>) = G_CONCAT_VECTORS [[COPY6]](<vscale x 2 x s64>), [[COPY7]](<vscale x 2 x s64>)
+  ; CHECK-NEXT:   RET_ReallyLR
+  ret void
+}
+
 ;; nxv8___ 1024-bit
 define void @formal_argument_nxv8i16_8(
   ; CHECK-LABEL: name: formal_argument_nxv8i16_8


