[llvm] 17e85cd - [AArch64] Async unwind - Always place the first LDP at the end when ReverseCSRRestoreSeq is true
Momchil Velikov via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 24 10:57:24 PST 2022
Author: Momchil Velikov
Date: 2022-02-24T18:48:07Z
New Revision: 17e85cd4109ca778a1b0f1a6521b16112ea1e766
URL: https://github.com/llvm/llvm-project/commit/17e85cd4109ca778a1b0f1a6521b16112ea1e766
DIFF: https://github.com/llvm/llvm-project/commit/17e85cd4109ca778a1b0f1a6521b16112ea1e766.diff
LOG: [AArch64] Async unwind - Always place the first LDP at the end when ReverseCSRRestoreSeq is true
This patch is in preparation for the async unwind CFI.
Put the first `LDP` at the end, so that the load-store optimizer can
run and merge the `LDP` and the `ADD` into a post-index `LDP`.
Do this always, as early as the initial creation of the CSR restore
instructions, even if that `LDP` is not guaranteed to be mergeable
with a subsequent `SP` increment.
This greatly simplifies the CFI generation for the epilogue, as
otherwise we would have to take extra steps to ensure the reordering
does not cross CFI instructions.
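As an illustration (this sequence mirrors the comment removed from
`adaptForLdStOpt` below; the particular registers and offsets are only
an example), a restore sequence created in the same order as the saves
looks like:

    ldp x26, x25, [sp]
    ldp x24, x23, [sp, #16]
    ldp x22, x21, [sp, #32]
    ldp x20, x19, [sp, #48]
    add sp, sp, #64

After moving the first `LDP` to the end, the last two instructions can
be merged into a post-index load:

    ldp x24, x23, [sp, #16]
    ldp x22, x21, [sp, #32]
    ldp x20, x19, [sp, #48]
    ldp x26, x25, [sp], #64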
Reviewed By: danielkiss
Differential Revision: https://reviews.llvm.org/D112328
Added:
Modified:
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 93daf02731801..39d8ca9045c85 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1049,38 +1049,6 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
}
}
-static void adaptForLdStOpt(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator FirstSPPopI,
- MachineBasicBlock::iterator LastPopI) {
- // Sometimes (when we restore in the same order as we save), we can end up
- // with code like this:
- //
- // ldp x26, x25, [sp]
- // ldp x24, x23, [sp, #16]
- // ldp x22, x21, [sp, #32]
- // ldp x20, x19, [sp, #48]
- // add sp, sp, #64
- //
- // In this case, it is always better to put the first ldp at the end, so
- // that the load-store optimizer can run and merge the ldp and the add into
- // a post-index ldp.
- // If we managed to grab the first pop instruction, move it to the end.
- if (ReverseCSRRestoreSeq)
- MBB.splice(FirstSPPopI, &MBB, LastPopI);
- // We should end up with something like this now:
- //
- // ldp x24, x23, [sp, #16]
- // ldp x22, x21, [sp, #32]
- // ldp x20, x19, [sp, #48]
- // ldp x26, x25, [sp]
- // add sp, sp, #64
- //
- // and the load-store optimizer can merge the last two instructions into:
- //
- // ldp x26, x25, [sp], #64
- //
-}
-
static bool isTargetWindows(const MachineFunction &MF) {
return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
}
@@ -1911,18 +1879,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
if (NoCalleeSaveRestore)
StackRestoreBytes += AfterCSRPopSize;
- // If we were able to combine the local stack pop with the argument pop,
- // then we're done.
- bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
-
- // If we're done after this, make sure to help the load store optimizer.
- if (Done)
- adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
-
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(StackRestoreBytes), TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
- if (Done) {
+ // If we were able to combine the local stack pop with the argument pop,
+ // then we're done.
+ if (NoCalleeSaveRestore || AfterCSRPopSize == 0) {
if (HasWinCFI) {
BuildMI(MBB, MBB.getFirstTerminator(), DL,
TII->get(AArch64::SEH_EpilogEnd))
@@ -1966,8 +1928,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
FirstSPPopI = Prev;
}
- adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
-
emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(AfterCSRPopSize), TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
@@ -2624,7 +2584,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
bool AArch64FrameLowering::restoreCalleeSavedRegisters(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -2632,14 +2592,14 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
SmallVector<RegPairInfo, 8> RegPairs;
bool NeedsWinCFI = needsWinCFI(MF);
- if (MI != MBB.end())
- DL = MI->getDebugLoc();
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
bool NeedShadowCallStackProlog = false;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
NeedShadowCallStackProlog, hasFP(MF));
- auto EmitMI = [&](const RegPairInfo &RPI) {
+ auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -2696,7 +2656,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
if (RPI.isPaired()) {
MIB.addReg(Reg2, getDefRegState(true));
MIB.addMemOperand(MF.getMachineMemOperand(
@@ -2713,6 +2673,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineMemOperand::MOLoad, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
+ return MIB->getIterator();
};
// SVE objects are always restored in reverse order.
@@ -2720,26 +2681,38 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
if (RPI.isScalable())
EmitMI(RPI);
- if (ReverseCSRRestoreSeq) {
- for (const RegPairInfo &RPI : reverse(RegPairs))
- if (!RPI.isScalable())
- EmitMI(RPI);
- } else if (homogeneousPrologEpilog(MF, &MBB)) {
- auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
+ if (homogeneousPrologEpilog(MF, &MBB)) {
+ auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
.setMIFlag(MachineInstr::FrameDestroy);
for (auto &RPI : RegPairs) {
MIB.addReg(RPI.Reg1, RegState::Define);
MIB.addReg(RPI.Reg2, RegState::Define);
}
return true;
- } else
- for (const RegPairInfo &RPI : RegPairs)
- if (!RPI.isScalable())
- EmitMI(RPI);
+ }
+
+ if (ReverseCSRRestoreSeq) {
+ MachineBasicBlock::iterator First = MBB.end();
+ for (const RegPairInfo &RPI : reverse(RegPairs)) {
+ if (RPI.isScalable())
+ continue;
+ MachineBasicBlock::iterator It = EmitMI(RPI);
+ if (First == MBB.end())
+ First = It;
+ }
+ if (First != MBB.end())
+ MBB.splice(MBBI, &MBB, First);
+ } else {
+ for (const RegPairInfo &RPI : RegPairs) {
+ if (RPI.isScalable())
+ continue;
+ (void)EmitMI(RPI);
+ }
+ }
if (NeedShadowCallStackProlog) {
// Shadow call stack epilog: ldr x30, [x18, #-8]!
- BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
.addReg(AArch64::X18, RegState::Define)
.addReg(AArch64::LR, RegState::Define)
.addReg(AArch64::X18)
diff --git a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir b/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
index a2c9a25da24c2..de4baec50e0c6 100644
--- a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
+++ b/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
@@ -1,5 +1,5 @@
-# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,BEFORELDSTOPT
-# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,AFTERLDSTOPT
+# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
+# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
#
--- |
@@ -31,13 +31,9 @@ body: |
; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 4
; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 6
- ; Before running the load-store optimizer, we emit a ldp and an add.
- ; BEFORELDSTOPT-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 0
- ; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 64, 0
-
- ; We want to make sure that after running the load-store optimizer, the ldp
- ; and the add get merged into a post-index ldp.
- ; AFTERLDSTOPT-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8
+ ; The ldp and the stack increment get merged even before
+ ; the load-store optimizer.
+ ; CHECK-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8
RET_ReallyLR
...
@@ -66,10 +62,12 @@ body: |
; the local stack size. This results in rewriting the offsets for all the
; save/restores and forbids us to merge the stack adjustment and the last pop.
; In this case, there is no point of moving the first CSR pair at the end.
- ; CHECK: $x26, $x25 = frame-destroy LDPXi $sp, 2
- ; CHECK-NEXT: $x24, $x23 = frame-destroy LDPXi $sp, 4
+ ; We do it anyway, as it's a small price to pay for the resulting
+ ; simplification in the epilogue emission code.
+ ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 4
; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 6
; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 8
+ ; CHECK-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 2
; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0
RET_ReallyLR
...
@@ -98,9 +96,6 @@ body: |
bb.1:
; CHECK: $x21, $x20 = frame-destroy LDPXi $sp, 2
- ; BEFORELDSTOPT-NEXT: $lr = frame-destroy LDRXui $sp, 0
- ; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
-
- ; AFTERLDSTOPT-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32
+ ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32
RET_ReallyLR
...