[llvm] r342049 - [AArch64] Implement aarch64_vector_pcs codegen support.
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 12 05:10:22 PDT 2018
Author: s.desmalen
Date: Wed Sep 12 05:10:22 2018
New Revision: 342049
URL: http://llvm.org/viewvc/llvm-project?rev=342049&view=rev
Log:
[AArch64] Implement aarch64_vector_pcs codegen support.
This patch adds codegen support for saving and restoring the full 128-bit
V8-V23 registers (Q8-Q23) for functions declared with the aarch64_vector_pcs
calling convention attribute, as added in patch D51477.
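For context, a function opts into this PCS at the LLVM IR level via the
aarch64_vector_pcs calling convention keyword, the same form used by the test
added below. A minimal sketch (the function name and body here are purely
illustrative, not taken from the patch):

  ; Callee using the vector PCS: with this patch, the prologue/epilogue
  ; will save and restore any of q8-q23 that the function clobbers.
  define aarch64_vector_pcs <4 x float> @scale(<4 x float> %v, <4 x float> %s) {
  entry:
    %r = fmul <4 x float> %v, %s
    ret <4 x float> %r
  }

A call site can carry the same convention (e.g. 'call aarch64_vector_pcs ...'),
in which case the caller may keep values live in q8-q23 across the call, since
the callee is known to preserve them.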
Reviewers: t.p.northover, gberry, thegameg, rengolin, javed.absar, MatzeB
Reviewed By: thegameg
Differential Revision: https://reviews.llvm.org/D51479
Added:
llvm/trunk/test/CodeGen/AArch64/aarch64-vector-pcs.mir
Modified:
llvm/trunk/lib/Target/AArch64/AArch64CallingConvention.td
llvm/trunk/lib/Target/AArch64/AArch64FrameLowering.cpp
llvm/trunk/lib/Target/AArch64/AArch64RegisterInfo.cpp
Modified: llvm/trunk/lib/Target/AArch64/AArch64CallingConvention.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64CallingConvention.td?rev=342049&r1=342048&r2=342049&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64CallingConvention.td (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64CallingConvention.td Wed Sep 12 05:10:22 2018
@@ -288,6 +288,12 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<
D8, D9, D10, D11,
D12, D13, D14, D15)>;
+// AArch64 PCS for vector functions (VPCS)
+// must (additionally) preserve full Q8-Q23 registers
+def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+ X23, X24, X25, X26, X27, X28,
+ (sequence "Q%u", 8, 23))>;
+
// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
// 'this' and the pointer return value are both passed in X0 in these cases,
// this can be partially modelled by treating X0 as a callee-saved register;
@@ -362,5 +368,7 @@ def CSR_AArch64_AAPCS_SwiftError_SCS
: CalleeSavedRegs<(add CSR_AArch64_AAPCS_SwiftError, X18)>;
def CSR_AArch64_RT_MostRegs_SCS
: CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs, X18)>;
+def CSR_AArch64_AAVPCS_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_AAVPCS, X18)>;
def CSR_AArch64_AAPCS_SCS
: CalleeSavedRegs<(add CSR_AArch64_AAPCS, X18)>;
Modified: llvm/trunk/lib/Target/AArch64/AArch64FrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64FrameLowering.cpp?rev=342049&r1=342048&r2=342049&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64FrameLowering.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64FrameLowering.cpp Wed Sep 12 05:10:22 2018
@@ -461,12 +461,19 @@ static MachineBasicBlock::iterator conve
NewOpc = AArch64::STPDpre;
Scale = 8;
break;
+ case AArch64::STPQi:
+ NewOpc = AArch64::STPQpre;
+ Scale = 16;
+ break;
case AArch64::STRXui:
NewOpc = AArch64::STRXpre;
break;
case AArch64::STRDui:
NewOpc = AArch64::STRDpre;
break;
+ case AArch64::STRQui:
+ NewOpc = AArch64::STRQpre;
+ break;
case AArch64::LDPXi:
NewOpc = AArch64::LDPXpost;
Scale = 8;
@@ -475,12 +482,19 @@ static MachineBasicBlock::iterator conve
NewOpc = AArch64::LDPDpost;
Scale = 8;
break;
+ case AArch64::LDPQi:
+ NewOpc = AArch64::LDPQpost;
+ Scale = 16;
+ break;
case AArch64::LDRXui:
NewOpc = AArch64::LDRXpost;
break;
case AArch64::LDRDui:
NewOpc = AArch64::LDRDpost;
break;
+ case AArch64::LDRQui:
+ NewOpc = AArch64::LDRQpost;
+ break;
}
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
@@ -531,6 +545,12 @@ static void fixupCalleeSaveRestoreStackO
case AArch64::LDRDui:
Scale = 8;
break;
+ case AArch64::STPQi:
+ case AArch64::STRQui:
+ case AArch64::LDPQi:
+ case AArch64::LDRQui:
+ Scale = 16;
+ break;
default:
llvm_unreachable("Unexpected callee-save save/restore opcode!");
}
@@ -541,7 +561,7 @@ static void fixupCalleeSaveRestoreStackO
// Last operand is immediate offset that needs fixing.
MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
// All generated opcodes have scaled offsets.
- assert(LocalStackSize % 8 == 0);
+ assert(LocalStackSize % Scale == 0);
OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
}
@@ -1208,7 +1228,7 @@ struct RegPairInfo {
unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
- enum RegType { GPR, FPR64 } Type;
+ enum RegType { GPR, FPR64, FPR128 } Type;
RegPairInfo() = default;
@@ -1246,6 +1266,8 @@ static void computeCalleeSaveRegisterPai
RPI.Type = RegPairInfo::GPR;
else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::FPR64;
+ else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
+ RPI.Type = RegPairInfo::FPR128;
else
llvm_unreachable("Unsupported register class.");
@@ -1261,6 +1283,10 @@ static void computeCalleeSaveRegisterPai
if (AArch64::FPR64RegClass.contains(NextReg))
RPI.Reg2 = NextReg;
break;
+ case RegPairInfo::FPR128:
+ if (AArch64::FPR128RegClass.contains(NextReg))
+ RPI.Reg2 = NextReg;
+ break;
}
}
@@ -1294,17 +1320,21 @@ static void computeCalleeSaveRegisterPai
RPI.FrameIdx = CSI[i].getFrameIdx();
- if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) {
- // Round up size of non-pair to pair size if we need to pad the
- // callee-save area to ensure 16-byte alignment.
- Offset -= 16;
+ int Scale = RPI.Type == RegPairInfo::FPR128 ? 16 : 8;
+ Offset -= RPI.isPaired() ? 2 * Scale : Scale;
+
+ // Round up size of non-pair to pair size if we need to pad the
+ // callee-save area to ensure 16-byte alignment.
+ if (AFI->hasCalleeSaveStackFreeSpace() &&
+ RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) {
+ Offset -= 8;
+ assert(Offset % 16 == 0);
assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
MFI.setObjectAlignment(RPI.FrameIdx, 16);
- AFI->setCalleeSaveStackHasFreeSpace(true);
- } else
- Offset -= RPI.isPaired() ? 16 : 8;
- assert(Offset % 8 == 0);
- RPI.Offset = Offset / 8;
+ }
+
+ assert(Offset % Scale == 0);
+ RPI.Offset = Offset / Scale;
assert((RPI.Offset >= -64 && RPI.Offset <= 63) &&
"Offset out of bounds for LDP/STP immediate");
@@ -1370,6 +1400,11 @@ bool AArch64FrameLowering::spillCalleeSa
Size = 8;
Align = 8;
break;
+ case RegPairInfo::FPR128:
+ StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
+ Size = 16;
+ Align = 16;
+ break;
}
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -1441,6 +1476,11 @@ bool AArch64FrameLowering::restoreCallee
Size = 8;
Align = 8;
break;
+ case RegPairInfo::FPR128:
+ LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
+ Size = 16;
+ Align = 16;
+ break;
}
LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -1507,24 +1547,6 @@ void AArch64FrameLowering::determineCall
? RegInfo->getBaseRegister()
: (unsigned)AArch64::NoRegister;
- unsigned SpillEstimate = SavedRegs.count();
- for (unsigned i = 0; CSRegs[i]; ++i) {
- unsigned Reg = CSRegs[i];
- unsigned PairedReg = CSRegs[i ^ 1];
- if (Reg == BasePointerReg)
- SpillEstimate++;
- if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg))
- SpillEstimate++;
- }
- SpillEstimate += 2; // Conservatively include FP+LR in the estimate
- unsigned StackEstimate = MFI.estimateStackSize(MF) + 8 * SpillEstimate;
-
- // The frame record needs to be created by saving the appropriate registers
- if (hasFP(MF) || windowsRequiresStackProbe(MF, StackEstimate)) {
- SavedRegs.set(AArch64::FP);
- SavedRegs.set(AArch64::LR);
- }
-
unsigned ExtraCSSpill = 0;
// Figure out which callee-saved registers to save/restore.
for (unsigned i = 0; CSRegs[i]; ++i) {
@@ -1548,7 +1570,8 @@ void AArch64FrameLowering::determineCall
// MachO's compact unwind format relies on all registers being stored in
// pairs.
// FIXME: the usual format is actually better if unwinding isn't needed.
- if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) {
+ if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister &&
+ !SavedRegs.test(PairedReg)) {
SavedRegs.set(PairedReg);
if (AArch64::GPR64RegClass.contains(PairedReg) &&
!RegInfo->isReservedReg(MF, PairedReg))
@@ -1556,6 +1579,24 @@ void AArch64FrameLowering::determineCall
}
}
+ // Calculates the callee saved stack size.
+ unsigned CSStackSize = 0;
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned Reg : SavedRegs.set_bits())
+ CSStackSize += TRI->getRegSizeInBits(Reg, MRI) / 8;
+
+ // Save number of saved regs, so we can easily update CSStackSize later.
+ unsigned NumSavedRegs = SavedRegs.count();
+
+ // The frame record needs to be created by saving the appropriate registers
+ unsigned EstimatedStackSize = MFI.estimateStackSize(MF);
+ if (hasFP(MF) ||
+ windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
+ SavedRegs.set(AArch64::FP);
+ SavedRegs.set(AArch64::LR);
+ }
+
LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
for (unsigned Reg
: SavedRegs.set_bits()) dbgs()
@@ -1563,15 +1604,12 @@ void AArch64FrameLowering::determineCall
dbgs() << "\n";);
// If any callee-saved registers are used, the frame cannot be eliminated.
- unsigned NumRegsSpilled = SavedRegs.count();
- bool CanEliminateFrame = NumRegsSpilled == 0;
+ bool CanEliminateFrame = SavedRegs.count() == 0;
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
- unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
- LLVM_DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
- bool BigStack = (CFSize > EstimatedStackSizeLimit);
+ bool BigStack = (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
AFI->setHasStackFrame(true);
@@ -1592,7 +1630,6 @@ void AArch64FrameLowering::determineCall
if (produceCompactUnwindFrame(MF))
SavedRegs.set(UnspilledCSGPRPaired);
ExtraCSSpill = UnspilledCSGPRPaired;
- NumRegsSpilled = SavedRegs.count();
}
// If we didn't find an extra callee-saved register to spill, create
@@ -1609,9 +1646,17 @@ void AArch64FrameLowering::determineCall
}
}
+ // Adding the size of additional 64bit GPR saves.
+ CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
+ unsigned AlignedCSStackSize = alignTo(CSStackSize, 16);
+ LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
+ << EstimatedStackSize + AlignedCSStackSize
+ << " bytes.\n");
+
// Round up to register pair alignment to avoid additional SP adjustment
// instructions.
- AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16));
+ AFI->setCalleeSavedStackSize(AlignedCSStackSize);
+ AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
}
bool AArch64FrameLowering::enableStackSlotScavenging(
Modified: llvm/trunk/lib/Target/AArch64/AArch64RegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64RegisterInfo.cpp?rev=342049&r1=342048&r2=342049&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64RegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64RegisterInfo.cpp Wed Sep 12 05:10:22 2018
@@ -50,8 +50,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(
if (MF->getFunction().getCallingConv() == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
- // FIXME: default to AAPCS until we add full support.
- return CSR_AArch64_AAPCS_SaveList;
+ return CSR_AArch64_AAVPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
@@ -102,8 +101,7 @@ AArch64RegisterInfo::getCallPreservedMas
return SCS ? CSR_AArch64_CXX_TLS_Darwin_SCS_RegMask
: CSR_AArch64_CXX_TLS_Darwin_RegMask;
if (CC == CallingConv::AArch64_VectorCall)
- // FIXME: default to AAPCS until we add full support.
- return SCS ? CSR_AArch64_AAPCS_SCS_RegMask : CSR_AArch64_AAPCS_RegMask;
+ return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask;
if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
Added: llvm/trunk/test/CodeGen/AArch64/aarch64-vector-pcs.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/aarch64-vector-pcs.mir?rev=342049&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/aarch64-vector-pcs.mir (added)
+++ llvm/trunk/test/CodeGen/AArch64/aarch64-vector-pcs.mir Wed Sep 12 05:10:22 2018
@@ -0,0 +1,253 @@
+# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s
+
+# The tests below check the allocation of 128-bit callee-saves
+# on the stack, specifically their offsets.
+
+# Padding of GPR64 registers is needed to ensure 16-byte alignment of
+# the stack pointer after the GPR64/FPR64 block (which is also needed
+# for the FPR128 saves when present).
+
+# This file also tests whether an emergency stack slot is allocated
+# when the stack frame is over a given size, caused by a series of
+# FPR128 saves. The alignment can leave a gap that can be used
+# for stack slot scavenging, so it is important that the stack size
+# is properly estimated.
+
+
+--- |
+
+ ; ModuleID = '<stdin>'
+ source_filename = "<stdin>"
+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+ target triple = "aarch64-unknown-linux-gnu"
+
+ ; Function Attrs: nounwind
+ define aarch64_vector_pcs void @test_q10_q11_x19() nounwind { entry: unreachable }
+
+ ; Function Attrs: nounwind
+ define aarch64_vector_pcs void @test_q10_q11_x19_x20() nounwind { entry: unreachable }
+
+ ; Function Attrs: nounwind
+ define aarch64_vector_pcs void @test_q10_q11_x19_x20_x21() nounwind { entry: unreachable }
+
+ ; Function Attrs: nounwind
+ define aarch64_vector_pcs void @test_q8_to_q23_x19_to_x30() nounwind { entry: unreachable }
+
+ ; Function Attrs: nounwind
+ define aarch64_vector_pcs void @test_q8_to_q23_x19_to_x30_preinc() nounwind { entry: unreachable }
+
+...
+---
+name: test_q10_q11_x19
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ $x19 = IMPLICIT_DEF
+ $q10 = IMPLICIT_DEF
+ $q11 = IMPLICIT_DEF
+
+ ; Check that the alignment gap for the 8-byte x19 is padded
+ ; with another 8 bytes. The CSR region will look like this:
+ ; +-------------------+
+ ; |/////padding///////| (8 bytes)
+ ; | X19 | (8 bytes)
+ ; +-------------------+ <- SP -16
+ ; | Q10, Q11 | (32 bytes)
+ ; +-------------------+ <- SP -48
+
+ ; CHECK-LABEL: test_q10_q11_x19{{[[:space:]]}}
+ ; CHECK-DAG: $sp = frame-setup STPQpre killed $q11, killed $q10, $sp, -3 :: (store 16 into %stack.[[Q11:[0-9]+]]), (store 16 into %stack.[[Q10:[0-9]+]])
+ ; CHECK-DAG: - { id: [[Q11]], {{.*}}, offset: -48, size: 16, alignment: 16
+ ; CHECK-DAG: - { id: [[Q10]], {{.*}}, offset: -32, size: 16, alignment: 16
+ ; CHECK-DAG: frame-setup STRXui killed $x19, $sp, 4 :: (store 8 into %stack.[[X19:[0-9]+]])
+ ; CHECK-DAG: - { id: [[X19]], {{.*}}, offset: -16, size: 8, alignment: 16
+
+...
+---
+name: test_q10_q11_x19_x20
+alignment: 2
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ $x19 = IMPLICIT_DEF
+ $x20 = IMPLICIT_DEF
+ $q10 = IMPLICIT_DEF
+ $q11 = IMPLICIT_DEF
+
+ ; +-------------------+
+ ; | X19, X20 | (16 bytes)
+ ; +-------------------+ <- SP -16
+ ; | Q10, Q11 | (32 bytes)
+ ; +-------------------+ <- SP -48
+
+ ; CHECK-LABEL: test_q10_q11_x19_x20{{[[:space:]]}}
+ ; CHECK-DAG: $sp = frame-setup STPQpre killed $q11, killed $q10, $sp, -3 :: (store 16 into %stack.[[Q11:[0-9]+]]), (store 16 into %stack.[[Q10:[0-9]+]])
+ ; CHECK-DAG: frame-setup STPXi killed $x20, killed $x19, $sp, 4 :: (store 8 into %stack.[[X20:[0-9]+]]), (store 8 into %stack.[[X19:[0-9]+]])
+ ; CHECK-DAG: - { id: [[Q11]], {{.*}}, offset: -48, size: 16, alignment: 16
+ ; CHECK-DAG: - { id: [[Q10]], {{.*}}, offset: -32, size: 16, alignment: 16
+ ; CHECK-DAG: - { id: [[X20]], {{.*}}, offset: -16, size: 8, alignment: 8
+ ; CHECK-DAG: - { id: [[X19]], {{.*}}, offset: -8, size: 8, alignment: 8
+
+...
+---
+name: test_q10_q11_x19_x20_x21
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ $x19 = IMPLICIT_DEF
+ $x20 = IMPLICIT_DEF
+ $x21 = IMPLICIT_DEF
+ $q10 = IMPLICIT_DEF
+ $q11 = IMPLICIT_DEF
+
+ ; Check that the alignment gap is padded with another 8 bytes.
+ ; The CSR region will look like this:
+ ; +-------------------+
+ ; | X19, X20 | (16 bytes)
+ ; +-------------------+ <- SP -16
+ ; |/////padding///////| (8 bytes)
+ ; | X21 | (8 bytes)
+ ; +-------------------+ <- SP -32
+ ; | Q10, Q11 | (32 bytes)
+ ; +-------------------+ <- SP -64
+
+ ; CHECK-LABEL: test_q10_q11_x19_x20_x21
+ ; CHECK-DAG: $sp = frame-setup STPQpre killed $q11, killed $q10, $sp, -4 :: (store 16 into %stack.[[Q11:[0-9]+]]), (store 16 into %stack.[[Q10:[0-9]+]])
+ ; CHECK-DAG: frame-setup STRXui killed $x21, $sp, 4 :: (store 8 into %stack.[[X21:[0-9]+]])
+ ; CHECK-DAG: frame-setup STPXi killed $x20, killed $x19, $sp, 6
+ ; CHECK-DAG: - { id: [[Q11]], {{.*}}, offset: -64, size: 16, alignment: 16
+ ; CHECK-DAG: - { id: [[Q10]], {{.*}}, offset: -48, size: 16, alignment: 16
+ ; CHECK-DAG: - { id: [[X21]], {{.*}}, offset: -32, size: 8, alignment: 16
+
+...
+---
+name: test_q8_to_q23_x19_to_x30
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ $x19 = IMPLICIT_DEF
+ $x20 = IMPLICIT_DEF
+ $x21 = IMPLICIT_DEF
+ $x22 = IMPLICIT_DEF
+ $x23 = IMPLICIT_DEF
+ $x24 = IMPLICIT_DEF
+ $x25 = IMPLICIT_DEF
+ $x26 = IMPLICIT_DEF
+ $x27 = IMPLICIT_DEF
+ $x28 = IMPLICIT_DEF
+ $fp = IMPLICIT_DEF
+ $lr = IMPLICIT_DEF
+ $q8 = IMPLICIT_DEF
+ $q9 = IMPLICIT_DEF
+ $q10 = IMPLICIT_DEF
+ $q11 = IMPLICIT_DEF
+ $q12 = IMPLICIT_DEF
+ $q13 = IMPLICIT_DEF
+ $q14 = IMPLICIT_DEF
+ $q15 = IMPLICIT_DEF
+ $q16 = IMPLICIT_DEF
+ $q17 = IMPLICIT_DEF
+ $q18 = IMPLICIT_DEF
+ $q19 = IMPLICIT_DEF
+ $q20 = IMPLICIT_DEF
+ $q21 = IMPLICIT_DEF
+ $q22 = IMPLICIT_DEF
+ $q23 = IMPLICIT_DEF
+
+ ; Test with more callee saves, which triggers 'BigStack' in
+ ; AArch64FrameLowering which in turn causes an emergency spill
+ ; slot to be allocated. The emergency spill slot is allocated
+ ; as close as possible to SP, so at SP + 0.
+ ; +-------------------+
+ ; | X19..X30 | (96 bytes)
+ ; +-------------------+ <- SP -96
+ ; | Q8..Q23 | (256 bytes)
+ ; +-------------------+ <- SP -352
+ ; | emergency slot | (16 bytes)
+ ; +-------------------+ <- SP -368
+
+ ; CHECK-LABEL: test_q8_to_q23_x19_to_x30
+ ; CHECK: $sp = frame-setup SUBXri $sp, 368, 0
+ ; CHECK-NEXT: frame-setup STPQi killed $q23, killed $q22, $sp, 1 :: (store 16 into %stack.{{[0-9]+}}), (store 16 into %stack.{{[0-9]+}})
+ ; CHECK-NEXT: frame-setup STPQi killed $q21, killed $q20, $sp, 3
+ ; CHECK-NEXT: frame-setup STPQi killed $q19, killed $q18, $sp, 5
+ ; CHECK-NEXT: frame-setup STPQi killed $q17, killed $q16, $sp, 7
+ ; CHECK-NEXT: frame-setup STPQi killed $q15, killed $q14, $sp, 9
+ ; CHECK-NEXT: frame-setup STPQi killed $q13, killed $q12, $sp, 11
+ ; CHECK-NEXT: frame-setup STPQi killed $q11, killed $q10, $sp, 13
+ ; CHECK-NEXT: frame-setup STPQi killed $q9, killed $q8, $sp, 15
+ ; CHECK-NEXT: frame-setup STPXi killed $x28, killed $x27, $sp, 34 :: (store 8 into %stack.{{[0-9]+}}), (store 8 into %stack.{{[0-9]+}})
+ ; CHECK-NEXT: frame-setup STPXi killed $x26, killed $x25, $sp, 36
+ ; CHECK-NEXT: frame-setup STPXi killed $x24, killed $x23, $sp, 38
+ ; CHECK-NEXT: frame-setup STPXi killed $x22, killed $x21, $sp, 40
+ ; CHECK-NEXT: frame-setup STPXi killed $x20, killed $x19, $sp, 42
+ ; CHECK-NEXT: frame-setup STPXi killed $fp, killed $lr, $sp, 44
+
+...
+---
+name: test_q8_to_q23_x19_to_x30_preinc
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 160, alignment: 4, local-offset: 0 }
+constants:
+body: |
+ bb.0.entry:
+ $x19 = IMPLICIT_DEF
+ $x20 = IMPLICIT_DEF
+ $x21 = IMPLICIT_DEF
+ $x22 = IMPLICIT_DEF
+ $x23 = IMPLICIT_DEF
+ $x24 = IMPLICIT_DEF
+ $x25 = IMPLICIT_DEF
+ $x26 = IMPLICIT_DEF
+ $x27 = IMPLICIT_DEF
+ $x28 = IMPLICIT_DEF
+ $fp = IMPLICIT_DEF
+ $lr = IMPLICIT_DEF
+ $q8 = IMPLICIT_DEF
+ $q9 = IMPLICIT_DEF
+ $q10 = IMPLICIT_DEF
+ $q11 = IMPLICIT_DEF
+ $q12 = IMPLICIT_DEF
+ $q13 = IMPLICIT_DEF
+ $q14 = IMPLICIT_DEF
+ $q15 = IMPLICIT_DEF
+ $q16 = IMPLICIT_DEF
+ $q17 = IMPLICIT_DEF
+ $q18 = IMPLICIT_DEF
+ $q19 = IMPLICIT_DEF
+ $q20 = IMPLICIT_DEF
+ $q21 = IMPLICIT_DEF
+ $q22 = IMPLICIT_DEF
+ $q23 = IMPLICIT_DEF
+
+ ; When the total stack size >= 512, it will use the pre-increment
+ ; rather than the 'sub sp, sp, <size>'.
+ ; +-------------------+
+ ; | X19..X30 | (96 bytes)
+ ; +-------------------+ <- SP -96
+ ; | Q8..Q23 | (256 bytes)
+ ; +-------------------+ <- SP -352
+ ; | 'obj' | (32 bytes)
+ ; +-------------------+ <- SP -384
+ ; | emergency slot | (16 bytes)
+ ; +-------------------+ <- SP -400
+
+ ; CHECK-LABEL: test_q8_to_q23_x19_to_x30_preinc
+ ; CHECK: $sp = frame-setup STPQpre killed $q23, killed $q22, $sp, -22 :: (store 16 into %stack.{{[0-9]+}}), (store 16 into %stack.{{[0-9]+}})
+ ; CHECK-NEXT: frame-setup STPQi killed $q21, killed $q20, $sp, 2
+ ; CHECK-NEXT: frame-setup STPQi killed $q19, killed $q18, $sp, 4
+ ; CHECK-NEXT: frame-setup STPQi killed $q17, killed $q16, $sp, 6
+ ; CHECK-NEXT: frame-setup STPQi killed $q15, killed $q14, $sp, 8
+ ; CHECK-NEXT: frame-setup STPQi killed $q13, killed $q12, $sp, 10
+ ; CHECK-NEXT: frame-setup STPQi killed $q11, killed $q10, $sp, 12
+ ; CHECK-NEXT: frame-setup STPQi killed $q9, killed $q8, $sp, 14
+ ; CHECK-NEXT: frame-setup STPXi killed $x28, killed $x27, $sp, 32 :: (store 8 into %stack.{{[0-9]+}}), (store 8 into %stack.{{[0-9]+}})
+ ; CHECK-NEXT: frame-setup STPXi killed $x26, killed $x25, $sp, 34
+ ; CHECK-NEXT: frame-setup STPXi killed $x24, killed $x23, $sp, 36
+ ; CHECK-NEXT: frame-setup STPXi killed $x22, killed $x21, $sp, 38
+ ; CHECK-NEXT: frame-setup STPXi killed $x20, killed $x19, $sp, 40
+ ; CHECK-NEXT: frame-setup STPXi killed $fp, killed $lr, $sp, 42
+ ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 176, 0
+
+...
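To make the new Q-register offset scaling concrete, here is the arithmetic
behind the first test above (derived from its CHECK lines; not part of the
patch itself):

  CSR area: q10+q11 (32 bytes) + x19 (8 bytes) + 8 bytes padding = 48 bytes
  STPQpre ..., $sp, -3 : pre-index immediate is scaled by 16, so sp := sp - 48;
                         q11 lands at [old sp - 48], q10 at [old sp - 32]
  STRXui $x19, $sp, 4  : unsigned immediate is scaled by 8, so x19 lands at
                         [new sp + 32] = [old sp - 16]

This corresponds to RPI.Offset = Offset / Scale in
computeCalleeSaveRegisterPairs, with Scale = 16 for FPR128 saves and 8
otherwise, and the assert that the scaled offset stays within the LDP/STP
immediate range of [-64, 63].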