[llvm] a7a96c7 - [AArch64] Implement passing SVE vectors by ref for AAPCS.

Sander de Smalen via llvm-commits <llvm-commits at lists.llvm.org>
Mon Feb 17 07:21:01 PST 2020


Author: Sander de Smalen
Date: 2020-02-17T15:20:28Z
New Revision: a7a96c726e884101b09ca8bea1944e2f5b3950e7

URL: https://github.com/llvm/llvm-project/commit/a7a96c726e884101b09ca8bea1944e2f5b3950e7
DIFF: https://github.com/llvm/llvm-project/commit/a7a96c726e884101b09ca8bea1944e2f5b3950e7.diff

LOG: [AArch64] Implement passing SVE vectors by ref for AAPCS.

Summary:
This patch implements the part of the calling convention
where SVE vectors are passed by reference. This means the
caller must allocate stack space for these objects and
pass the address to the callee.
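
As a source-level illustration, here is a minimal sketch in C using the
Arm C Language Extensions (the function names are hypothetical and the
snippet is not part of this patch; it assumes a compiler with SVE support,
e.g. -march=armv8-a+sve). Only z0-z7 are available for passing SVE vector
arguments, so the ninth vector argument below is passed indirectly: the
caller allocates stack space for it and passes the address.

    #include <arm_sve.h>

    // The ninth SVE vector argument does not fit in z0-z7, so under the
    // AAPCS the caller spills it to a stack slot it allocates itself and
    // passes the slot's address (here in x0) instead of the value.
    svint32_t callee(svint32_t z0, svint32_t z1, svint32_t z2, svint32_t z3,
                     svint32_t z4, svint32_t z5, svint32_t z6, svint32_t z7,
                     svint32_t by_ref);

    svint32_t caller(svint32_t v) {
      // The compiler stores 'v' to a caller-allocated SVE stack object and
      // passes that object's address as the last argument.
      return callee(v, v, v, v, v, v, v, v, v);
    }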

Reviewers: efriedma, rovka, cameron.mcinally, c-rhodes, rengolin

Reviewed By: efriedma

Subscribers: tschuett, kristof.beyls, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71216

Added: 
    llvm/test/CodeGen/AArch64/sve-callbyref-notailcall.ll
    llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d9ed4d4c039f..968d5a995fac 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14943,6 +14943,12 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {
       !LD->getValueType(0).isInteger())
     return false;
 
+  // The algorithm to split up a load into individual elements currently
+  // requires knowing the length of the loaded type, so it will need
+  // adjusting before it can work on scalable vectors.
+  if (LD->getValueType(0).isScalableVector())
+    return false;
+
   // Keep track of already used bits to detect overlapping values.
   // In that case, we will just abort the transformation.
   APInt UsedBits(LD->getValueSizeInBits(0), 0);
@@ -16579,7 +16585,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
       }
 
       if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
-          !ST1->getBasePtr().isUndef()) {
+          !ST1->getBasePtr().isUndef() &&
+          // BaseIndexOffset and the code below require knowing the size
+          // of a vector, so bail out if MemoryVT is scalable.
+          !ST1->getMemoryVT().isScalableVector()) {
         const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
         const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
         unsigned STBitSize = ST->getMemoryVT().getSizeInBits();

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 86ce09152417..59dd4905de5a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3331,9 +3331,6 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
   switch (CC) {
   default:
     report_fatal_error("Unsupported calling convention.");
-  case CallingConv::AArch64_SVE_VectorCall:
-    // Calling SVE functions is currently not yet supported.
-    report_fatal_error("Unsupported calling convention.");
   case CallingConv::WebKit_JS:
     return CC_AArch64_WebKit_JS;
   case CallingConv::GHC:
@@ -3356,6 +3353,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
    case CallingConv::CFGuard_Check:
      return CC_AArch64_Win64_CFGuard_Check;
    case CallingConv::AArch64_VectorCall:
+   case CallingConv::AArch64_SVE_VectorCall:
      return CC_AArch64_AAPCS;
   }
 }
@@ -3474,7 +3472,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       case CCValAssign::Indirect:
         assert(VA.getValVT().isScalableVector() &&
                "Only scalable vectors can be passed indirectly");
-        llvm_unreachable("Spilling of SVE vectors not yet implemented");
+        break;
       case CCValAssign::BCvt:
         ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
         break;
@@ -3491,7 +3489,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     } else { // VA.isRegLoc()
       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
       unsigned ArgOffset = VA.getLocMemOffset();
-      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
+      unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
+                              ? VA.getLocVT().getSizeInBits()
+                              : VA.getValVT().getSizeInBits()) / 8;
 
       uint32_t BEAlign = 0;
       if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
@@ -3517,7 +3517,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       case CCValAssign::Indirect:
         assert(VA.getValVT().isScalableVector() &&
                "Only scalable vectors can be passed indirectly");
-        llvm_unreachable("Spilling of SVE vectors not yet implemented");
+        MemVT = VA.getLocVT();
+        break;
       case CCValAssign::SExt:
         ExtType = ISD::SEXTLOAD;
         break;
@@ -3535,6 +3536,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
           MemVT);
 
     }
+
+    if (VA.getLocInfo() == CCValAssign::Indirect) {
+      assert(VA.getValVT().isScalableVector() &&
+           "Only scalable vectors can be passed indirectly");
+      // If the value is passed via a pointer, do a load.
+      ArgValue =
+          DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
+    }
+
     if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
       ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
                              ArgValue, DAG.getValueType(MVT::i32));
@@ -3895,6 +3905,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
 
   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
 
+  // If any of the arguments is passed indirectly, it must be SVE, so the
+  // 'getBytesInStackArgArea' is not sufficient to determine whether we need
+  // to allocate space on the stack. That is why we explicitly decide here
+  // that the call cannot be a tail call.
+  if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
+        assert((A.getLocInfo() != CCValAssign::Indirect ||
+                A.getValVT().isScalableVector()) &&
+               "Expected value to be scalable");
+        return A.getLocInfo() == CCValAssign::Indirect;
+      }))
+    return false;
+
   // If the stack arguments for this call do not fit into our own save area then
   // the call cannot be made tail.
   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
@@ -4135,7 +4157,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     case CCValAssign::Indirect:
       assert(VA.getValVT().isScalableVector() &&
              "Only scalable vectors can be passed indirectly");
-      llvm_unreachable("Spilling of SVE vectors not yet implemented");
+      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+      Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
+      unsigned Align = DAG.getDataLayout().getPrefTypeAlignment(Ty);
+      int FI = MFI.CreateStackObject(
+          VA.getValVT().getStoreSize().getKnownMinSize(), Align, false);
+      MFI.setStackID(FI, TargetStackID::SVEVector);
+
+      SDValue SpillSlot = DAG.getFrameIndex(
+          FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+      Chain = DAG.getStore(
+          Chain, DL, Arg, SpillSlot,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+      Arg = SpillSlot;
+      break;
     }
 
     if (VA.isRegLoc()) {
@@ -4183,8 +4218,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
       // FIXME: This works on big-endian for composite byvals, which are the
       // common case. It should also work for fundamental types too.
       uint32_t BEAlign = 0;
-      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
-                                        : VA.getValVT().getSizeInBits();
+      unsigned OpSize;
+      if (VA.getLocInfo() == CCValAssign::Indirect)
+        OpSize = VA.getLocVT().getSizeInBits();
+      else
+        OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+                                 : VA.getValVT().getSizeInBits();
       OpSize = (OpSize + 7) / 8;
       if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
           !Flags.isInConsecutiveRegs()) {

diff  --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index f2d0d963d621..e188fa4e2fce 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1288,6 +1288,9 @@ let Predicates = [HasSVE] in {
   multiclass unpred_store<ValueType Ty, Instruction RegImmInst, Instruction PTrue> {
     def _fi : Pat<(store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
                        (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+
+    def _default : Pat<(store (Ty ZPR:$val), GPR64:$base),
+                       (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
   }
 
   defm Pat_ST1B        : unpred_store<nxv16i8, ST1B_IMM, PTRUE_B>;
@@ -1301,6 +1304,9 @@ let Predicates = [HasSVE] in {
   multiclass unpred_load<ValueType Ty, Instruction RegImmInst, Instruction PTrue> {
     def _fi : Pat<(Ty (load  (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
                   (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+
+    def _default : Pat<(Ty (load GPR64:$base)),
+                       (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
   }
 
   defm Pat_LD1B        : unpred_load<nxv16i8, LD1B_IMM, PTRUE_B>;
@@ -1314,6 +1320,9 @@ let Predicates = [HasSVE] in {
   multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
     def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
                   (Store PPR:$val, GPR64sp:$base, simm9:$offset)>;
+
+    def _default : Pat<(store (Ty PPR:$Val), GPR64:$base),
+                  (Store PPR:$Val, GPR64:$base, (i64 0))>;
   }
 
   defm Pat_Store_P16 : unpred_store_predicate<nxv16i1, STR_PXI>;
@@ -1324,6 +1333,9 @@ let Predicates = [HasSVE] in {
   multiclass unpred_load_predicate<ValueType Ty, Instruction Load> {
     def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))),
                   (Load GPR64sp:$base, simm9:$offset)>;
+
+    def _default : Pat<(Ty (load GPR64:$base)),
+                  (Load GPR64:$base, (i64 0))>;
   }
 
   defm Pat_Load_P16 : unpred_load_predicate<nxv16i1, LDR_PXI>;

diff  --git a/llvm/test/CodeGen/AArch64/sve-callbyref-notailcall.ll b/llvm/test/CodeGen/AArch64/sve-callbyref-notailcall.ll
new file mode 100644
index 000000000000..ca29e15697fe
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-callbyref-notailcall.ll
@@ -0,0 +1,29 @@
+; Because some arguments are passed by reference (via the stack),
+; the compiler should not do tail-call optimization.
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+; CHECK-LABEL: caller:
+; CHECK:       addvl sp, sp, #-[[STACKSIZE:[0-9]+]]
+; CHECK-NOT:   addvl sp
+; CHECK:       bl callee
+; CHECK:       addvl sp, sp, #[[STACKSIZE]]
+; CHECK:       ret
+define <vscale x 16 x i8> @caller(<vscale x 16 x i8> %v) {
+  %1 = tail call <vscale x 16 x i8> @callee(<vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v)
+  ret <vscale x 16 x i8> %1
+}
+
+declare <vscale x 16 x i8> @callee(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+; CHECK-LABEL: caller_pred:
+; CHECK:       addvl sp, sp, #-[[STACKSIZE:[0-9]+]]
+; CHECK-NOT:   addvl sp
+; CHECK:       bl callee_pred
+; CHECK:       addvl sp, sp, #[[STACKSIZE]]
+; CHECK:       ret
+define <vscale x 16 x i1> @caller_pred(<vscale x 16 x i1> %v) {
+  %1 = tail call <vscale x 16 x i1> @callee_pred(<vscale x 16 x i1> %v, <vscale x 16 x i1> %v, <vscale x 16 x i1> %v, <vscale x 16 x i1> %v, <vscale x 16 x i1> %v)
+  ret <vscale x 16 x i1> %1
+}
+
+declare <vscale x 16 x i1> @callee_pred(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)

diff  --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
new file mode 100644
index 000000000000..bbb8209941b0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -0,0 +1,118 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -stop-after=finalize-isel < %s | FileCheck %s
+
+; Test that z8 and z9, passed by reference, are correctly loaded from x0 and x1.
+; i.e. z0 =  %z0
+;         :
+;      z7 =  %z7
+;      x0 = &%z8
+;      x1 = &%z9
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_sve_arg(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3, <vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5, <vscale x 4 x i32> %z6, <vscale x 4 x i32> %z7, <vscale x 4 x i32> %z8, <vscale x 4 x i32> %z9) {
+; CHECK: name: callee_with_many_sve_arg
+; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = COPY $x1
+; CHECK-DAG: [[PTRUE:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG: [[RES:%[0-9]+]]:zpr = LD1W_IMM killed [[PTRUE]], [[BASE]]
+; CHECK-DAG: $z0 = COPY [[RES]]
+; CHECK:     RET_ReallyLR implicit $z0
+  ret <vscale x 4 x i32> %z9
+}
+
+; Test that z8 and z9 are passed by reference.
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @caller_with_many_sve_arg(<vscale x 4 x i32> %z) {
+; CHECK: name: caller_with_many_sve_arg
+; CHECK: stack:
+; CHECK:      - { id: 0, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK:      - { id: 1, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK-DAG:  [[PTRUE:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG:  ST1W_IMM %{{[0-9]+}}, [[PTRUE]], %stack.1, 0
+; CHECK-DAG:  ST1W_IMM %{{[0-9]+}}, [[PTRUE]], %stack.0, 0
+; CHECK-DAG:  [[BASE2:%[0-9]+]]:gpr64sp = ADDXri %stack.1, 0
+; CHECK-DAG:  [[BASE1:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0
+; CHECK-DAG:  $x0 = COPY [[BASE1]]
+; CHECK-DAG:  $x1 = COPY [[BASE2]]
+; CHECK-NEXT: BL @callee_with_many_sve_arg
+; CHECK:      RET_ReallyLR implicit $z0
+  %ret = call aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_sve_arg(<vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z)
+  ret <vscale x 4 x i32> %ret
+}
+
+; Test that p4 and p5, passed by reference, are correctly loaded from registers x0 and x1.
+; i.e. p0 =  %p0
+;         :
+;      p3 =  %p3
+;      x0 = &%p4
+;      x1 = &%p5
+define aarch64_sve_vector_pcs <vscale x 4 x i1> @callee_with_many_svepred_arg(<vscale x 4 x i1> %p0, <vscale x 4 x i1> %p1, <vscale x 4 x i1> %p2, <vscale x 4 x i1> %p3, <vscale x 4 x i1> %p4, <vscale x 4 x i1> %p5) {
+; CHECK: name: callee_with_many_svepred_arg
+; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = COPY $x1
+; CHECK-DAG: [[RES:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0
+; CHECK-DAG: $p0 = COPY [[RES]]
+; CHECK:     RET_ReallyLR implicit $p0
+  ret <vscale x 4 x i1> %p5
+}
+
+; Test that p4 and p5 are passed by reference.
+define aarch64_sve_vector_pcs <vscale x 4 x i1> @caller_with_many_svepred_arg(<vscale x 4 x i1> %p) {
+; CHECK: name: caller_with_many_svepred_arg
+; CHECK: stack:
+; CHECK:      - { id: 0, name: '', type: default, offset: 0, size: 1, alignment: 4,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK:      - { id: 1, name: '', type: default, offset: 0, size: 1, alignment: 4,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.0, 0
+; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.1, 0
+; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0
+; CHECK-DAG: [[BASE2:%[0-9]+]]:gpr64sp = ADDXri %stack.1, 0
+; CHECK-DAG: $x0 = COPY [[BASE1]]
+; CHECK-DAG: $x1 = COPY [[BASE2]]
+; CHECK-NEXT: BL @callee_with_many_svepred_arg
+; CHECK:     RET_ReallyLR implicit $p0
+  %ret = call aarch64_sve_vector_pcs <vscale x 4 x i1> @callee_with_many_svepred_arg(<vscale x 4 x i1> %p, <vscale x 4 x i1> %p, <vscale x 4 x i1> %p, <vscale x 4 x i1> %p, <vscale x 4 x i1> %p, <vscale x 4 x i1> %p)
+  ret <vscale x 4 x i1> %ret
+}
+
+; Test that z8 and z9, passed by reference, are loaded from a location that is passed on the stack.
+; i.e.     x0 =   %x0
+;             :
+;          x7 =   %x7
+;          z0 =   %z0
+;             :
+;          z7 =   %z7
+;        [sp] =  &%z8
+;      [sp+8] =  &%z9
+;
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_gpr_sve_arg(i64 %x0, i64 %x1, i64 %x2, i64 %x3, i64 %x4, i64 %x5, i64 %x6, i64 %x7, <vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3, <vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5, <vscale x 4 x i32> %z6, <vscale x 4 x i32> %z7, <vscale x 2 x i64> %z8, <vscale x 4 x i32> %z9) {
+; CHECK: name: callee_with_many_gpr_sve_arg
+; CHECK: fixedStack:
+; CHECK:      - { id: 0, type: default, offset: 8, size: 8, alignment: 8, stack-id: default,
+; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = LDRXui %fixed-stack.0, 0
+; CHECK-DAG: [[PTRUE:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG: [[RES:%[0-9]+]]:zpr = LD1W_IMM killed [[PTRUE]], killed [[BASE]]
+; CHECK-DAG: $z0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $z0
+  ret <vscale x 4 x i32> %z9
+}
+
+; Test that z8 and z9 are passed by reference, where the reference itself is passed on the stack.
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @caller_with_many_gpr_sve_arg(i64 %x, <vscale x 4 x i32> %z, <vscale x 2 x i64> %z2) {
+; CHECK: name: caller_with_many_gpr_sve_arg
+; CHECK: stack:
+; CHECK:      - { id: 0, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK:      - { id: 1, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK-DAG: [[PTRUE_S:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG: [[PTRUE_D:%[0-9]+]]:ppr_3b = PTRUE_D 31
+; CHECK-DAG: ST1D_IMM %{{[0-9]+}}, killed [[PTRUE_D]], %stack.0, 0
+; CHECK-DAG: ST1W_IMM %{{[0-9]+}}, killed [[PTRUE_S]], %stack.1, 0
+; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0
+; CHECK-DAG: [[BASE2:%[0-9]+]]:gpr64common = ADDXri %stack.1, 0
+; CHECK-DAG: [[SP:%[0-9]+]]:gpr64sp = COPY $sp
+; CHECK-DAG: STRXui killed [[BASE1]], [[SP]], 0
+; CHECK-DAG: STRXui killed [[BASE2]], [[SP]], 1
+; CHECK:     BL @callee_with_many_gpr_sve_arg
+; CHECK:     RET_ReallyLR implicit $z0
+  %ret = call aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_gpr_sve_arg(i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 2 x i64> %z2, <vscale x 4 x i32> %z)
+  ret <vscale x 4 x i32> %ret
+}


        


More information about the llvm-commits mailing list