[llvm] 84a0c8e - [AArch64][SVE] Spilling/filling of SVE callee-saves.

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 11 01:05:00 PST 2019


Author: Sander de Smalen
Date: 2019-11-11T09:03:19Z
New Revision: 84a0c8e3ae92829c4f04ba995b4b6283d397f65d

URL: https://github.com/llvm/llvm-project/commit/84a0c8e3ae92829c4f04ba995b4b6283d397f65d
DIFF: https://github.com/llvm/llvm-project/commit/84a0c8e3ae92829c4f04ba995b4b6283d397f65d.diff

LOG: [AArch64][SVE] Spilling/filling of SVE callee-saves.

Implement the spills/fills of callee-saved SVE registers using STR and LDR
instructions.
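
For example, with z8-z10 live across a function, the prologue now
allocates the callee-save area with ADDVL (an SP adjustment scaled by
the SVE vector length) and spills each register at a VL-scaled
immediate, as checked by the save_restore_zregs_sve test below:

   $sp = frame-setup ADDVL_XXI $sp, -3
   frame-setup STR_ZXI killed $z10, $sp, 0
   frame-setup STR_ZXI killed $z9, $sp, 1
   frame-setup STR_ZXI killed $z8, $sp, 2

The epilogue mirrors this with LDR_ZXI fills followed by
'$sp = frame-destroy ADDVL_XXI $sp, 3'.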

Also adds the `aarch64_sve_vector_pcs` calling convention to specify the
callee-saved registers to be used for functions that return SVE vectors or
take SVE vectors as arguments. The callee-saved registers are vector
registers z8-z23 and predicate registers p4-p15.
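
In IR this surfaces as a new calling convention keyword, recognised by
the lexer, parser and printer changes below. A minimal function
definition using it (mirroring the definitions in the updated MIR test):

   define aarch64_sve_vector_pcs void @sve_fn() nounwind {
   entry:
     ret void
   }

Note that only the definition side is handled so far: CCAssignFnForCall
still reports a fatal error for AArch64_SVE_VectorCall, so calling such
functions is not yet supported.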

The overall frame layout with SVE will be as follows:

   +-------------+
   | stack args  |
   +-------------+
   | Callee Saves|
   |   X29, X30  |
   |-------------| <- FP
   | SVE Callee  | < //////////////
   | saved regs  | < //////////////
   |    z23      | < //////////////
   |     :       | < // SCALABLE //
   |    z8       | < //////////////
   |    p15      | < /// STACK ////
   |     :       | < //////////////
   |    p4       | < //// AREA ////
   +-------------+ < //////////////
   |     :       | < //////////////
   |  SVE locals | < //////////////
   |     :       | < //////////////
   +-------------+
   |/////////////| alignment gap.
   |     :       |
   | Stack objs  |
   |     :       |
   +-------------+ <- SP after call and frame-setup
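
As a worked example of the scalable-offset arithmetic (numbers from the
save_restore_pregs_sve test below): saving p4-p6 needs 3 x 2 = 6
scalable bytes, which is padded to a full vector (16 scalable bytes) to
keep the area 16-byte aligned for any vector length. The prologue thus
emits a single ADDVL of -1, and the STR_PXI immediates are scaled by
the predicate size of 2 scalable bytes:

   $sp = frame-setup ADDVL_XXI $sp, -1
   frame-setup STR_PXI killed $p6, $sp, 5
   frame-setup STR_PXI killed $p5, $sp, 6
   frame-setup STR_PXI killed $p4, $sp, 7

That is, the slots are assigned downwards from the top of the area
(p4 at scalable byte offset 14, p5 at 12, p6 at 10), with the remaining
10 scalable bytes left as alignment padding.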

Reviewers: cameron.mcinally, efriedma, greened, thegameg, ostannard, rengolin

Reviewed By: ostannard

Differential Revision: https://reviews.llvm.org/D68996

Added: 
    

Modified: 
    llvm/lib/AsmParser/LLLexer.cpp
    llvm/lib/AsmParser/LLParser.cpp
    llvm/lib/AsmParser/LLToken.h
    llvm/lib/IR/AsmWriter.cpp
    llvm/lib/Target/AArch64/AArch64CallingConvention.td
    llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
    llvm/lib/Target/AArch64/AArch64FrameLowering.h
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
    llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
    llvm/test/CodeGen/AArch64/framelayout-sve.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 887a931c64d2..847ca0430e67 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -594,6 +594,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(arm_aapcscc);
   KEYWORD(arm_aapcs_vfpcc);
   KEYWORD(aarch64_vector_pcs);
+  KEYWORD(aarch64_sve_vector_pcs);
   KEYWORD(msp430_intrcc);
   KEYWORD(avr_intrcc);
   KEYWORD(avr_signalcc);

diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index e9d2dfc195b1..bb2c65f6d9a6 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -1931,6 +1931,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
 ///   ::= 'arm_aapcscc'
 ///   ::= 'arm_aapcs_vfpcc'
 ///   ::= 'aarch64_vector_pcs'
+///   ::= 'aarch64_sve_vector_pcs'
 ///   ::= 'msp430_intrcc'
 ///   ::= 'avr_intrcc'
 ///   ::= 'avr_signalcc'
@@ -1977,6 +1978,9 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
   case lltok::kw_arm_aapcscc:    CC = CallingConv::ARM_AAPCS; break;
   case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break;
   case lltok::kw_aarch64_vector_pcs:CC = CallingConv::AArch64_VectorCall; break;
+  case lltok::kw_aarch64_sve_vector_pcs:
+    CC = CallingConv::AArch64_SVE_VectorCall;
+    break;
   case lltok::kw_msp430_intrcc:  CC = CallingConv::MSP430_INTR; break;
   case lltok::kw_avr_intrcc:     CC = CallingConv::AVR_INTR; break;
   case lltok::kw_avr_signalcc:   CC = CallingConv::AVR_SIGNAL; break;

diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h
index 9153b49aa045..9029e15af9fd 100644
--- a/llvm/lib/AsmParser/LLToken.h
+++ b/llvm/lib/AsmParser/LLToken.h
@@ -142,6 +142,7 @@ enum Kind {
   kw_arm_aapcscc,
   kw_arm_aapcs_vfpcc,
   kw_aarch64_vector_pcs,
+  kw_aarch64_sve_vector_pcs,
   kw_msp430_intrcc,
   kw_avr_intrcc,
   kw_avr_signalcc,

diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 107b32ea3263..5ee0d52fe995 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -364,6 +364,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::ARM_AAPCS:     Out << "arm_aapcscc"; break;
   case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break;
   case CallingConv::AArch64_VectorCall: Out << "aarch64_vector_pcs"; break;
+  case CallingConv::AArch64_SVE_VectorCall:
+    Out << "aarch64_sve_vector_pcs";
+    break;
   case CallingConv::MSP430_INTR:   Out << "msp430_intrcc"; break;
   case CallingConv::AVR_INTR:      Out << "avr_intrcc "; break;
   case CallingConv::AVR_SIGNAL:    Out << "avr_signalcc "; break;

diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 3c4121b1185e..3c179ae76f0e 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -405,10 +405,10 @@ def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
 
 // Functions taking SVE arguments or returning an SVE type
 // must (additionally) preserve full Z8-Z23 and predicate registers P4-P15
-def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
-                                               X25, X26, X27, X28, LR, FP,
-                                               (sequence "Z%u", 8, 23),
-                                               (sequence "P%u", 4, 15))>;
+def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add (sequence "Z%u", 8, 23),
+                                                 (sequence "P%u", 4, 15),
+                                                 X19, X20, X21, X22, X23, X24,
+                                                 X25, X26, X27, X28, LR, FP)>;
 
 // Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
 // 'this' and the pointer return value are both passed in X0 in these cases,
@@ -486,5 +486,7 @@ def CSR_AArch64_RT_MostRegs_SCS
     : CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs, X18)>;
 def CSR_AArch64_AAVPCS_SCS
     : CalleeSavedRegs<(add CSR_AArch64_AAVPCS, X18)>;
+def CSR_AArch64_SVE_AAPCS_SCS
+    : CalleeSavedRegs<(add CSR_AArch64_SVE_AAPCS, X18)>;
 def CSR_AArch64_AAPCS_SCS
     : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X18)>;

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index bbd7c51fde94..afe4f1402cf3 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -834,6 +834,20 @@ static bool isTargetDarwin(const MachineFunction &MF) {
   return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin();
 }
 
+// Convenience function to determine whether I is an SVE callee save.
+static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
+  switch (I->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STR_ZXI:
+  case AArch64::STR_PXI:
+  case AArch64::LDR_ZXI:
+  case AArch64::LDR_PXI:
+    return I->getFlag(MachineInstr::FrameSetup) ||
+           I->getFlag(MachineInstr::FrameDestroy);
+  }
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -965,7 +979,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // and pre-inc if we decided to combine the callee-save and local stack
   // pointer bump above.
   MachineBasicBlock::iterator End = MBB.end();
-  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
+  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
+         !IsSVECalleeSave(MBBI)) {
     if (CombineSPBump)
       fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
                                         NeedsWinCFI, &HasWinCFI);
@@ -1107,7 +1122,35 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     NumBytes = 0;
   }
 
-  emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII,
+  StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
+  MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
+
+  // Process the SVE callee-saves to determine what space needs to be
+  // allocated.
+  if (AFI->getSVECalleeSavedStackSize()) {
+    // Find callee save instructions in frame.
+    CalleeSavesBegin = MBBI;
+    assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
+    while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
+      ++MBBI;
+    CalleeSavesEnd = MBBI;
+
+    int64_t OffsetToFirstCalleeSaveFromSP =
+        MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
+    StackOffset OffsetToCalleeSavesFromSP =
+        StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
+    AllocateBefore -= OffsetToCalleeSavesFromSP;
+    AllocateAfter = SVEStackSize - AllocateBefore;
+  }
+
+  // Allocate space for the callee saves (if any).
+  emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
+                  -AllocateBefore, TII,
+                  MachineInstr::FrameSetup);
+
+  // Finally allocate remaining SVE stack space.
+  emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
+                  -AllocateAfter, TII,
                   MachineInstr::FrameSetup);
 
   // Allocate space for the rest of the frame.
@@ -1444,7 +1487,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   MachineBasicBlock::iterator Begin = MBB.begin();
   while (LastPopI != Begin) {
     --LastPopI;
-    if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) {
+    if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
+        IsSVECalleeSave(LastPopI)) {
       ++LastPopI;
       break;
     } else if (CombineSPBump)
@@ -1476,11 +1520,53 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   NumBytes -= PrologueSaveSize;
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
 
+  // Process the SVE callee-saves to determine what space needs to be
+  // deallocated.
+  StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
+  MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
+  if (AFI->getSVECalleeSavedStackSize()) {
+    RestoreBegin = std::prev(RestoreEnd);
+    while (IsSVECalleeSave(RestoreBegin) &&
+           RestoreBegin != MBB.begin())
+      --RestoreBegin;
+    ++RestoreBegin;
+
+    assert(IsSVECalleeSave(RestoreBegin) &&
+           IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
+
+    int64_t OffsetToFirstCalleeSaveFromSP =
+        MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
+    StackOffset OffsetToCalleeSavesFromSP =
+        StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
+    DeallocateBefore = OffsetToCalleeSavesFromSP;
+    DeallocateAfter = SVEStackSize - DeallocateBefore;
+  }
+
   // Deallocate the SVE area.
-  if (SVEStackSize)
-    if (!AFI->isStackRealigned())
-      emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize,
-                      TII, MachineInstr::FrameDestroy);
+  if (SVEStackSize) {
+    if (AFI->isStackRealigned()) {
+      if (AFI->getSVECalleeSavedStackSize())
+        // Set SP to start of SVE area, from which the callee-save reloads
+        // can be done. The code below will deallocate the stack space by
+        // moving FP -> SP.
+        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
+                        -SVEStackSize, TII, MachineInstr::FrameDestroy);
+    } else {
+      if (AFI->getSVECalleeSavedStackSize()) {
+        // Deallocate the non-SVE locals first before we can deallocate (and
+        // restore callee saves) from the SVE area.
+        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+                        {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy);
+        NumBytes = 0;
+      }
+
+      emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+                      DeallocateBefore, TII, MachineInstr::FrameDestroy);
+
+      emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+                      DeallocateAfter, TII, MachineInstr::FrameDestroy);
+    }
+  }
 
   if (!hasFP(MF)) {
     bool RedZone = canUseRedZone(MF);
@@ -1813,11 +1899,28 @@ struct RegPairInfo {
   unsigned Reg2 = AArch64::NoRegister;
   int FrameIdx;
   int Offset;
-  enum RegType { GPR, FPR64, FPR128 } Type;
+  enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
 
   RegPairInfo() = default;
 
   bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+
+  unsigned getScale() const {
+    switch (Type) {
+    case PPR:
+      return 2;
+    case GPR:
+    case FPR64:
+      return 8;
+    case ZPR:
+    case FPR128:
+      return 16;
+    default:
+      llvm_unreachable("Unsupported type");
+    }
+  }
+
+  bool isScalable() const { return Type == PPR || Type == ZPR; }
 };
 
 } // end anonymous namespace
@@ -1842,7 +1945,8 @@ static void computeCalleeSaveRegisterPairs(
           CC == CallingConv::PreserveMost ||
           (Count & 1) == 0) &&
          "Odd number of callee-saved regs to spill!");
-  int Offset = AFI->getCalleeSavedStackSize();
+  int ByteOffset = AFI->getCalleeSavedStackSize();
+  int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
   // On Linux, we will have either one or zero non-paired register.  On Windows
   // with CFI, we can have multiple unpaired registers in order to utilize the
   // available unwind codes.  This flag assures that the alignment fixup is done
@@ -1858,6 +1962,10 @@ static void computeCalleeSaveRegisterPairs(
       RPI.Type = RegPairInfo::FPR64;
     else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
       RPI.Type = RegPairInfo::FPR128;
+    else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
+      RPI.Type = RegPairInfo::ZPR;
+    else if (AArch64::PPRRegClass.contains(RPI.Reg1))
+      RPI.Type = RegPairInfo::PPR;
     else
       llvm_unreachable("Unsupported register class.");
 
@@ -1880,6 +1988,9 @@ static void computeCalleeSaveRegisterPairs(
         if (AArch64::FPR128RegClass.contains(NextReg))
           RPI.Reg2 = NextReg;
         break;
+      case RegPairInfo::PPR:
+      case RegPairInfo::ZPR:
+        break;
       }
     }
 
@@ -1917,23 +2028,33 @@ static void computeCalleeSaveRegisterPairs(
 
     RPI.FrameIdx = CSI[i].getFrameIdx();
 
-    int Scale = RPI.Type == RegPairInfo::FPR128 ? 16 : 8;
-    Offset -= RPI.isPaired() ? 2 * Scale : Scale;
+    int Scale = RPI.getScale();
+    if (RPI.isScalable())
+      ScalableByteOffset -= Scale;
+    else
+      ByteOffset -= RPI.isPaired() ? 2 * Scale : Scale;
+
+    assert(!(RPI.isScalable() && RPI.isPaired()) &&
+           "Paired spill/fill instructions don't exist for SVE vectors");
 
     // Round up size of non-pair to pair size if we need to pad the
     // callee-save area to ensure 16-byte alignment.
     if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone &&
-        RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) {
+        !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
+        !RPI.isPaired()) {
       FixupDone = true;
-      Offset -= 8;
-      assert(Offset % 16 == 0);
+      ByteOffset -= 8;
+      assert(ByteOffset % 16 == 0);
       assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
       MFI.setObjectAlignment(RPI.FrameIdx, 16);
     }
 
+    int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
     assert(Offset % Scale == 0);
     RPI.Offset = Offset / Scale;
-    assert((RPI.Offset >= -64 && RPI.Offset <= 63) &&
+
+    assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
+            (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
            "Offset out of bounds for LDP/STP immediate");
 
     RegPairs.push_back(RPI);
@@ -2025,6 +2146,16 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
        Size = 16;
        Align = 16;
        break;
+    case RegPairInfo::ZPR:
+       StrOpc = AArch64::STR_ZXI;
+       Size = 16;
+       Align = 16;
+       break;
+    case RegPairInfo::PPR:
+       StrOpc = AArch64::STR_PXI;
+       Size = 2;
+       Align = 2;
+       break;
     }
     LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -2065,6 +2196,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameSetup);
 
+    // Update the StackIDs of the SVE stack slots.
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
+      MFI.setStackID(RPI.FrameIdx, TargetStackID::SVEVector);
+
   }
   return true;
 }
@@ -2116,6 +2252,16 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
        Size = 16;
        Align = 16;
        break;
+    case RegPairInfo::ZPR:
+       LdrOpc = AArch64::LDR_ZXI;
+       Size = 16;
+       Align = 16;
+       break;
+    case RegPairInfo::PPR:
+       LdrOpc = AArch64::LDR_PXI;
+       Size = 2;
+       Align = 2;
+       break;
     }
     LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -2150,12 +2296,20 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
   };
-  if (ReverseCSRRestoreSeq)
-    for (const RegPairInfo &RPI : reverse(RegPairs))
+
+  // SVE objects are always restored in reverse order.
+  for (const RegPairInfo &RPI : reverse(RegPairs))
+    if (RPI.isScalable())
       EmitMI(RPI);
-  else
+
+  if (ReverseCSRRestoreSeq) {
+    for (const RegPairInfo &RPI : reverse(RegPairs))
+      if (!RPI.isScalable())
+        EmitMI(RPI);
+  } else
     for (const RegPairInfo &RPI : RegPairs)
-      EmitMI(RPI);
+      if (!RPI.isScalable())
+        EmitMI(RPI);
 
   if (NeedShadowCallStackProlog) {
     // Shadow call stack epilog: ldr x30, [x18, #-8]!
@@ -2202,7 +2356,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
       SavedRegs.set(Reg);
 
     bool RegUsed = SavedRegs.test(Reg);
-    unsigned PairedReg = CSRegs[i ^ 1];
+    unsigned PairedReg = AArch64::NoRegister;
+    if (AArch64::GPR64RegClass.contains(Reg) ||
+        AArch64::FPR64RegClass.contains(Reg) ||
+        AArch64::FPR128RegClass.contains(Reg))
+      PairedReg = CSRegs[i ^ 1];
+
     if (!RegUsed) {
       if (AArch64::GPR64RegClass.contains(Reg) &&
           !RegInfo->isReservedReg(MF, Reg)) {
@@ -2226,10 +2385,17 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
 
   // Calculates the callee saved stack size.
   unsigned CSStackSize = 0;
+  unsigned SVECSStackSize = 0;
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
-  for (unsigned Reg : SavedRegs.set_bits())
-    CSStackSize += TRI->getRegSizeInBits(Reg, MRI) / 8;
+  for (unsigned Reg : SavedRegs.set_bits()) {
+    auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
+    if (AArch64::PPRRegClass.contains(Reg) ||
+        AArch64::ZPRRegClass.contains(Reg))
+      SVECSStackSize += RegSize;
+    else
+      CSStackSize += RegSize;
+  }
 
   // Save number of saved regs, so we can easily update CSStackSize later.
   unsigned NumSavedRegs = SavedRegs.count();
@@ -2249,10 +2415,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
              dbgs() << "\n";);
 
   // If any callee-saved registers are used, the frame cannot be eliminated.
-  unsigned MaxAlign = getStackAlignment();
   int64_t SVEStackSize =
-      alignTo(determineSVEStackSize(MFI, MaxAlign), MaxAlign);
-  assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+      alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
   bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
 
   // The CSR spill slots have not been allocated yet, so estimateStackSize
@@ -2313,6 +2477,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   // instructions.
   AFI->setCalleeSavedStackSize(AlignedCSStackSize);
   AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
+  AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
 }
 
 bool AArch64FrameLowering::enableStackSlotScavenging(
@@ -2321,9 +2486,39 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
   return AFI->hasCalleeSaveStackFreeSpace();
 }
 
-int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI,
-                                                    unsigned &MaxAlign) const {
-  // Process all fixed stack objects.
+/// Returns true if there are any SVE callee saves.
+static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
+                                      int &Min, int &Max) {
+  if (!MFI.isCalleeSavedInfoValid())
+    return false;
+
+  Min = std::numeric_limits<int>::max();
+  Max = std::numeric_limits<int>::min();
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  for (auto &CS : CSI) {
+    if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
+        AArch64::PPRRegClass.contains(CS.getReg())) {
+      assert((Max == std::numeric_limits<int>::min() ||
+              Max + 1 == CS.getFrameIdx()) &&
+             "SVE CalleeSaves are not consecutive");
+
+      Min = std::min(Min, CS.getFrameIdx());
+      Max = std::max(Max, CS.getFrameIdx());
+    }
+  }
+  return Min != std::numeric_limits<int>::max();
+}
+
+// Process all the SVE stack objects and determine offsets for each
+// object. If AssignOffsets is true, the offsets get assigned.
+// Fills in the first and last callee-saved frame indices into
+// Min/MaxCSFrameIndex, respectively.
+// Returns the size of the SVE stack area.
+static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
+                                              int &MinCSFrameIndex,
+                                              int &MaxCSFrameIndex,
+                                              bool AssignOffsets) {
+  // First process all fixed stack objects.
   int64_t Offset = 0;
   for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
     if (MFI.getStackID(I) == TargetStackID::SVEVector) {
@@ -2332,12 +2527,41 @@ int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI,
         Offset = FixedOffset;
     }
 
+  // Then process all callee saved slots.
+  if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
+    // Make sure to align the last callee save slot.
+    MFI.setObjectAlignment(MaxCSFrameIndex, 16U);
+
+    // Assign offsets to the callee save slots.
+    for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
+      Offset += MFI.getObjectSize(I);
+      Offset = alignTo(Offset, MFI.getObjectAlignment(I));
+      if (AssignOffsets) {
+        LLVM_DEBUG(dbgs() << "alloc FI(" << I << ") at SP[" << Offset
+                          << "]\n");
+        MFI.setObjectOffset(I, -Offset);
+      }
+    }
+  }
+
   // Note: We don't take allocatable stack objects into
   // account yet, because allocation for those is not yet
   // implemented.
   return Offset;
 }
 
+int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
+    MachineFrameInfo &MFI) const {
+  int MinCSFrameIndex, MaxCSFrameIndex;
+  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
+                                        false);
+}
+
+int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
+    MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
+  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
+                                        true);
+}
+
 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
   MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -2345,12 +2569,13 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
   assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
          "Upwards growing stack unsupported");
 
-  unsigned MaxAlign = getStackAlignment();
-  int64_t SVEStackSize = determineSVEStackSize(MFI, MaxAlign);
+  int MinCSFrameIndex, MaxCSFrameIndex;
+  int64_t SVEStackSize =
+      assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
 
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  AFI->setStackSizeSVE(alignTo(SVEStackSize, MaxAlign));
-  assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+  AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
+  AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
 
   // If this function isn't doing Win64-style C++ EH, we don't need to do
   // anything.

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index ac150e86c9eb..f84847def34d 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -101,7 +101,11 @@ class AArch64FrameLowering : public TargetFrameLowering {
 private:
   bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
                                       unsigned StackBumpBytes) const;
-  int64_t determineSVEStackSize(MachineFrameInfo &MF, unsigned &MaxAlign) const;
+
+  int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const;
+  int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
+                                      int &MinCSFrameIndex,
+                                      int &MaxCSFrameIndex) const;
 };
 
 } // End llvm namespace

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9705a1b94615..2a3b3a3ac2f8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3118,6 +3118,9 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
   switch (CC) {
   default:
     report_fatal_error("Unsupported calling convention.");
+  case CallingConv::AArch64_SVE_VectorCall:
+    // Calling SVE functions is currently not yet supported.
+    report_fatal_error("Unsupported calling convention.");
   case CallingConv::WebKit_JS:
     return CC_AArch64_WebKit_JS;
   case CallingConv::GHC:

diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 32661860934a..dc9ca277b47f 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -53,8 +53,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// Amount of stack frame size, not including callee-saved registers.
   unsigned LocalStackSize;
 
+  /// The start and end frame indices for the SVE callee saves.
+  int MinSVECSFrameIndex;
+  int MaxSVECSFrameIndex;
+
   /// Amount of stack frame size used for saving callee-saved registers.
   unsigned CalleeSavedStackSize;
+  unsigned SVECalleeSavedStackSize;
   bool HasCalleeSavedStackSize = false;
 
   /// Number of TLS accesses using the special (combinable)
@@ -161,7 +166,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   void setCalleeSaveStackHasFreeSpace(bool s) {
     CalleeSaveStackHasFreeSpace = s;
   }
-
   bool isSplitCSR() const { return IsSplitCSR; }
   void setIsSplitCSR(bool s) { IsSplitCSR = s; }
 
@@ -218,6 +222,22 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
     return CalleeSavedStackSize;
   }
 
+  // Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes'
+  void setSVECalleeSavedStackSize(unsigned Size) {
+    SVECalleeSavedStackSize = Size;
+  }
+  unsigned getSVECalleeSavedStackSize() const {
+    return SVECalleeSavedStackSize;
+  }
+
+  void setMinMaxSVECSFrameIndex(int Min, int Max) {
+    MinSVECSFrameIndex = Min;
+    MaxSVECSFrameIndex = Max;
+  }
+
+  int getMinSVECSFrameIndex() const { return MinSVECSFrameIndex; }
+  int getMaxSVECSFrameIndex() const { return MaxSVECSFrameIndex; }
+
   void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
   unsigned getNumLocalDynamicTLSAccesses() const {
     return NumLocalDynamicTLSAccesses;

diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 918e89fd8868..47ccef5ed83f 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -55,6 +55,8 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     return CSR_AArch64_AllRegs_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
     return CSR_AArch64_AAVPCS_SaveList;
+  if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall)
+    return CSR_AArch64_SVE_AAPCS_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
     return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
            CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
@@ -125,7 +127,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
   if (CC == CallingConv::AArch64_VectorCall)
     return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask;
   if (CC == CallingConv::AArch64_SVE_VectorCall)
-    return CSR_AArch64_SVE_AAPCS_RegMask;
+    return SCS ? CSR_AArch64_SVE_AAPCS_SCS_RegMask
+               : CSR_AArch64_SVE_AAPCS_RegMask;
   if (CC == CallingConv::CFGuard_Check)
     return CSR_Win_AArch64_CFGuard_Check_RegMask;
   if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()

diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 452ae94e2456..18d6796b172c 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -30,6 +30,10 @@
   define void @test_address_sve_fp() nounwind { entry: unreachable }
   define void @test_stack_arg_sve() nounwind { entry: unreachable }
   define void @test_address_sve_out_of_range() nounwind { entry: unreachable }
+  define aarch64_sve_vector_pcs void @save_restore_pregs_sve() nounwind { entry: unreachable }
+  define aarch64_sve_vector_pcs void @save_restore_zregs_sve() nounwind { entry: unreachable }
+  define aarch64_sve_vector_pcs void @save_restore_sve() nounwind { entry: unreachable }
+  define aarch64_sve_vector_pcs void @save_restore_sve_realign() nounwind { entry: unreachable }
 
 ...
 # +----------+
@@ -328,3 +332,183 @@ body:             |
 
     RET_ReallyLR
 ---
+...
+# CHECK-LABEL: name: save_restore_pregs_sve
+# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1
+# CHECK: frame-setup STR_PXI killed $p6, $sp, 5
+# CHECK: frame-setup STR_PXI killed $p5, $sp, 6
+# CHECK: frame-setup STR_PXI killed $p4, $sp, 7
+# CHECK: $sp = frame-setup SUBXri $sp, 32, 0
+
+# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0
+# CHECK: $p6 = frame-destroy LDR_PXI $sp, 5
+# CHECK: $p5 = frame-destroy LDR_PXI $sp, 6
+# CHECK: $p4 = frame-destroy LDR_PXI $sp, 7
+# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1
+# CHECK: RET_ReallyLR
+name: save_restore_pregs_sve
+stack:
+  - { id: 0, stack-id: default, size: 32, alignment: 16 }
+body:             |
+  bb.0.entry:
+
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+
+    RET_ReallyLR
+---
+...
+# CHECK-LABEL: name: save_restore_zregs_sve
+# CHECK: $sp = frame-setup ADDVL_XXI $sp, -3
+# CHECK: frame-setup STR_ZXI killed $z10, $sp, 0
+# CHECK: frame-setup STR_ZXI killed $z9, $sp, 1
+# CHECK: frame-setup STR_ZXI killed $z8, $sp, 2
+# CHECK: $sp = frame-setup SUBXri $sp, 32, 0
+
+# CHECK: $sp  = frame-destroy ADDXri $sp, 32, 0
+# CHECK: $z10 = frame-destroy LDR_ZXI $sp, 0
+# CHECK: $z9  = frame-destroy LDR_ZXI $sp, 1
+# CHECK: $z8  = frame-destroy LDR_ZXI $sp, 2
+# CHECK: $sp  = frame-destroy ADDVL_XXI $sp, 3
+# CHECK: RET_ReallyLR
+name: save_restore_zregs_sve
+stack:
+  - { id: 0, stack-id: default, size: 32, alignment: 16 }
+body:             |
+  bb.0.entry:
+
+    $z8 = IMPLICIT_DEF
+    $z9 = IMPLICIT_DEF
+    $z10 = IMPLICIT_DEF
+
+    RET_ReallyLR
+---
+...
+# Test allocation/deallocation of the stack frame together with the
+# saving/restoring of callee save registers. Fixed-stack objects
+# are allocated before the callee-saves.
+# This also adds some non-SVE callee-saves, to ensure that those are
+# paired correctly.
+#
+# CHECK-LABEL: name: save_restore_sve
+# CHECK: $sp = frame-setup STPXpre killed ${{[a-z0-9]+}}, killed $x21, $sp, -4
+# CHECK: frame-setup STPXi killed $x20, killed $x19, $sp, 2
+# CHECK: $sp = frame-setup ADDVL_XXI $sp, -19
+# CHECK: frame-setup STR_PXI killed $p15, $sp, 4
+# CHECK: frame-setup STR_PXI killed $p14, $sp, 5
+# CHECK: frame-setup STR_PXI killed $p5, $sp, 14
+# CHECK: frame-setup STR_PXI killed $p4, $sp, 15
+# CHECK: frame-setup STR_ZXI killed $z23, $sp, 2
+# CHECK: frame-setup STR_ZXI killed $z22, $sp, 3
+# CHECK: frame-setup STR_ZXI killed $z9, $sp, 16
+# CHECK: frame-setup STR_ZXI killed $z8, $sp, 17
+# CHECK: $sp = frame-setup SUBXri $sp, 32, 0
+
+# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0
+# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4
+# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5
+# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
+# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15
+# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
+# CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3
+# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
+# CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17
+# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 19
+# CHECK: $x20, $x19 = frame-destroy LDPXi $sp, 2
+# CHECK: $sp, ${{[a-z0-9]+}}, $x21 = frame-destroy LDPXpost $sp, 4
+# CHECK: RET_ReallyLR
+name: save_restore_sve
+fixedStack:
+  - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 }
+stack:
+  - { id: 0, stack-id: default, size: 32, alignment: 16 }
+body:             |
+  bb.0.entry:
+
+    $z8_z9_z10_z11   = IMPLICIT_DEF
+    $z12_z13_z14_z15 = IMPLICIT_DEF
+    $z16_z17_z18_z19 = IMPLICIT_DEF
+    $z20_z21_z22_z23 = IMPLICIT_DEF
+    $z24_z25_z26_z27 = IMPLICIT_DEF
+    $z28_z29_z30_z31 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $x19 = IMPLICIT_DEF
+    $x20 = IMPLICIT_DEF
+    $x21 = IMPLICIT_DEF
+
+    RET_ReallyLR
+---
+...
+# Test allocation/deallocation of the stack frame together with the
+# saving/restoring of callee save registers. Fixed-stack objects
+# are allocated before the callee-saves.
+#
+# CHECK-LABEL: name: save_restore_sve_realign
+# CHECK:      $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -19
+# CHECK-NEXT: STR_PXI killed $p15, $sp, 4
+# CHECK-NEXT: STR_PXI killed $p14, $sp, 5
+# CHECK:      STR_PXI killed $p5, $sp, 14
+# CHECK-NEXT: STR_PXI killed $p4, $sp, 15
+# CHECK-NEXT: STR_ZXI killed $z23, $sp, 2
+# CHECK-NEXT: STR_ZXI killed $z22, $sp, 3
+# CHECK:      STR_ZXI killed $z9, $sp, 16
+# CHECK-NEXT: STR_ZXI killed $z8, $sp, 17
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0
+# CHECK-NEXT: $sp = ANDXri killed $[[TMP]]
+
+# CHECK:      $sp = frame-destroy ADDVL_XXI $fp, -19
+# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4
+# CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5
+# CHECK:      $p5 = frame-destroy LDR_PXI $sp, 14
+# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 15
+# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
+# CHECK:      $z9 = frame-destroy LDR_ZXI $sp, 16
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
+# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
+# CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2
+# CHECK-NEXT: RET_ReallyLR
+name: save_restore_sve_realign
+fixedStack:
+  - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 }
+stack:
+  - { id: 0, stack-id: default, size: 16, alignment: 32 }
+body:             |
+  bb.0.entry:
+
+    $z8_z9_z10_z11   = IMPLICIT_DEF
+    $z12_z13_z14_z15 = IMPLICIT_DEF
+    $z16_z17_z18_z19 = IMPLICIT_DEF
+    $z20_z21_z22_z23 = IMPLICIT_DEF
+    $z24_z25_z26_z27 = IMPLICIT_DEF
+    $z28_z29_z30_z31 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    RET_ReallyLR
+---


        

