[llvm] 17e85cd - [AArch64] Async unwind - Always place the first LDP at the end when ReverseCSRRestoreSeq is true

Momchil Velikov via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 24 10:57:24 PST 2022


Author: Momchil Velikov
Date: 2022-02-24T18:48:07Z
New Revision: 17e85cd4109ca778a1b0f1a6521b16112ea1e766

URL: https://github.com/llvm/llvm-project/commit/17e85cd4109ca778a1b0f1a6521b16112ea1e766
DIFF: https://github.com/llvm/llvm-project/commit/17e85cd4109ca778a1b0f1a6521b16112ea1e766.diff

LOG: [AArch64] Async unwind - Always place the first LDP at the end when ReverseCSRRestoreSeq is true

This patch is in preparation for the async unwind CFI.

Put the first `LDP` at the end, so that the load-store optimizer can run
and merge the `LDP` and the `ADD` into a post-index `LDP`.

Do this always, as early as the initial creation of the CSR restore
instructions, even if that `LDP` is not guaranteed to be mergeable with
a subsequent `SP` increment.

This greatly simplifies the CFI generation for the epilogue, as otherwise
we would have to take extra steps to ensure reordering does not cross CFI
instructions.

Reviewed By: danielkiss

Differential Revision: https://reviews.llvm.org/D112328

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
    llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 93daf02731801..39d8ca9045c85 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1049,38 +1049,6 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
   }
 }
 
-static void adaptForLdStOpt(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator FirstSPPopI,
-                            MachineBasicBlock::iterator LastPopI) {
-  // Sometimes (when we restore in the same order as we save), we can end up
-  // with code like this:
-  //
-  // ldp      x26, x25, [sp]
-  // ldp      x24, x23, [sp, #16]
-  // ldp      x22, x21, [sp, #32]
-  // ldp      x20, x19, [sp, #48]
-  // add      sp, sp, #64
-  //
-  // In this case, it is always better to put the first ldp at the end, so
-  // that the load-store optimizer can run and merge the ldp and the add into
-  // a post-index ldp.
-  // If we managed to grab the first pop instruction, move it to the end.
-  if (ReverseCSRRestoreSeq)
-    MBB.splice(FirstSPPopI, &MBB, LastPopI);
-  // We should end up with something like this now:
-  //
-  // ldp      x24, x23, [sp, #16]
-  // ldp      x22, x21, [sp, #32]
-  // ldp      x20, x19, [sp, #48]
-  // ldp      x26, x25, [sp]
-  // add      sp, sp, #64
-  //
-  // and the load-store optimizer can merge the last two instructions into:
-  //
-  // ldp      x26, x25, [sp], #64
-  //
-}
-
 static bool isTargetWindows(const MachineFunction &MF) {
   return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
 }
@@ -1911,18 +1879,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     if (NoCalleeSaveRestore)
       StackRestoreBytes += AfterCSRPopSize;
 
-    // If we were able to combine the local stack pop with the argument pop,
-    // then we're done.
-    bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
-
-    // If we're done after this, make sure to help the load store optimizer.
-    if (Done)
-      adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
-
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                     StackOffset::getFixed(StackRestoreBytes), TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-    if (Done) {
+    // If we were able to combine the local stack pop with the argument pop,
+    // then we're done.
+    if (NoCalleeSaveRestore || AfterCSRPopSize == 0) {
       if (HasWinCFI) {
         BuildMI(MBB, MBB.getFirstTerminator(), DL,
                 TII->get(AArch64::SEH_EpilogEnd))
@@ -1966,8 +1928,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       FirstSPPopI = Prev;
     }
 
-    adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
-
     emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
                     StackOffset::getFixed(AfterCSRPopSize), TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
@@ -2624,7 +2584,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
 }
 
 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -2632,14 +2592,14 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
   SmallVector<RegPairInfo, 8> RegPairs;
   bool NeedsWinCFI = needsWinCFI(MF);
 
-  if (MI != MBB.end())
-    DL = MI->getDebugLoc();
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
 
   bool NeedShadowCallStackProlog = false;
   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
                                  NeedShadowCallStackProlog, hasFP(MF));
 
-  auto EmitMI = [&](const RegPairInfo &RPI) {
+  auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
     unsigned Reg1 = RPI.Reg1;
     unsigned Reg2 = RPI.Reg2;
 
@@ -2696,7 +2656,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
       std::swap(Reg1, Reg2);
       std::swap(FrameIdxReg1, FrameIdxReg2);
     }
-    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
     if (RPI.isPaired()) {
       MIB.addReg(Reg2, getDefRegState(true));
       MIB.addMemOperand(MF.getMachineMemOperand(
@@ -2713,6 +2673,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
         MachineMemOperand::MOLoad, Size, Alignment));
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
+    return MIB->getIterator();
   };
 
   // SVE objects are always restored in reverse order.
@@ -2720,26 +2681,38 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     if (RPI.isScalable())
       EmitMI(RPI);
 
-  if (ReverseCSRRestoreSeq) {
-    for (const RegPairInfo &RPI : reverse(RegPairs))
-      if (!RPI.isScalable())
-        EmitMI(RPI);
-  } else if (homogeneousPrologEpilog(MF, &MBB)) {
-    auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
+  if (homogeneousPrologEpilog(MF, &MBB)) {
+    auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
                    .setMIFlag(MachineInstr::FrameDestroy);
     for (auto &RPI : RegPairs) {
       MIB.addReg(RPI.Reg1, RegState::Define);
       MIB.addReg(RPI.Reg2, RegState::Define);
     }
     return true;
-  } else
-    for (const RegPairInfo &RPI : RegPairs)
-      if (!RPI.isScalable())
-        EmitMI(RPI);
+  }
+
+  if (ReverseCSRRestoreSeq) {
+    MachineBasicBlock::iterator First = MBB.end();
+    for (const RegPairInfo &RPI : reverse(RegPairs)) {
+      if (RPI.isScalable())
+        continue;
+      MachineBasicBlock::iterator It = EmitMI(RPI);
+      if (First == MBB.end())
+        First = It;
+    }
+    if (First != MBB.end())
+      MBB.splice(MBBI, &MBB, First);
+  } else {
+    for (const RegPairInfo &RPI : RegPairs) {
+      if (RPI.isScalable())
+        continue;
+      (void)EmitMI(RPI);
+    }
+  }
 
   if (NeedShadowCallStackProlog) {
     // Shadow call stack epilog: ldr x30, [x18, #-8]!
-    BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
         .addReg(AArch64::X18, RegState::Define)
         .addReg(AArch64::LR, RegState::Define)
         .addReg(AArch64::X18)

diff  --git a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir b/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
index a2c9a25da24c2..de4baec50e0c6 100644
--- a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
+++ b/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
@@ -1,5 +1,5 @@
-# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,BEFORELDSTOPT
-# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,AFTERLDSTOPT
+# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
+# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
 #
 --- |
 
@@ -31,13 +31,9 @@ body:             |
   ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 4
   ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 6
 
-  ; Before running the load-store optimizer, we emit a ldp and an add.
-  ; BEFORELDSTOPT-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 0
-  ; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 64, 0
-
-  ; We want to make sure that after running the load-store optimizer, the ldp
-  ; and the add get merged into a post-index ldp.
-  ; AFTERLDSTOPT-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8
+  ; The ldp and the stack increment get merged even before
+  ; the load-store optimizer.
+  ; CHECK-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8
 
     RET_ReallyLR
 ...
@@ -66,10 +62,12 @@ body:             |
   ; the local stack size. This results in rewriting the offsets for all the
   ; save/restores and forbids us to merge the stack adjustment and the last pop.
   ; In this case, there is no point of moving the first CSR pair at the end.
-  ; CHECK: $x26, $x25 = frame-destroy LDPXi $sp, 2
-  ; CHECK-NEXT: $x24, $x23 = frame-destroy LDPXi $sp, 4
+  ; We do it anyway, as it's a small price to pay for the resulting
+  ; simplification in the epilogue emission code.
+  ; CHECK:      $x24, $x23 = frame-destroy LDPXi $sp, 4
   ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 6
   ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 8
+  ; CHECK-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 2
   ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0
     RET_ReallyLR
 ...
@@ -98,9 +96,6 @@ body:             |
 
   bb.1:
    ; CHECK: $x21, $x20 = frame-destroy LDPXi $sp, 2
-   ; BEFORELDSTOPT-NEXT: $lr = frame-destroy LDRXui $sp, 0
-   ; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
-
-   ; AFTERLDSTOPT-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32
+   ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32
     RET_ReallyLR
 ...


        


More information about the llvm-commits mailing list