[llvm] f589e50 - [LoongArch] Split SP adjustment
Weining Lu via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 28 01:40:54 PDT 2022
Author: wanglei
Date: 2022-10-28T16:39:00+08:00
New Revision: f589e5067fc8c15d8fc228169055c63ff29b2c14
URL: https://github.com/llvm/llvm-project/commit/f589e5067fc8c15d8fc228169055c63ff29b2c14
DIFF: https://github.com/llvm/llvm-project/commit/f589e5067fc8c15d8fc228169055c63ff29b2c14.diff
LOG: [LoongArch] Split SP adjustment
This patch splits the SP adjustment to reduce the number of instructions
in the prologue and epilogue. This way, the offsets of the callee-saved
register spills can each fit in a single store instruction.
Similar to D68011 (RISCV).
Differential Revision: https://reviews.llvm.org/D136222
Added:
llvm/test/CodeGen/LoongArch/split-sp-adjust.ll
Modified:
llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
llvm/test/CodeGen/LoongArch/stack-realignment.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
index 45472157b4821..e8985d9282432 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -138,11 +138,17 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
// First, compute final stack size.
uint64_t StackSize = MFI.getStackSize();
+ uint64_t RealStackSize = StackSize;
// Early exit if there is no need to allocate space in the stack.
if (StackSize == 0 && !MFI.adjustsStack())
return;
+ uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
+ // Split the SP adjustment to reduce the offsets of callee saved spill.
+ if (FirstSPAdjustAmount)
+ StackSize = FirstSPAdjustAmount;
+
// Adjust stack.
adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);
// Emit ".cfi_def_cfa_offset StackSize".
@@ -184,7 +190,29 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Emit the second SP adjustment after saving callee saved registers.
+ if (FirstSPAdjustAmount) {
+ uint64_t SecondSPAdjustAmount = RealStackSize - FirstSPAdjustAmount;
+ assert(SecondSPAdjustAmount > 0 &&
+ "SecondSPAdjustAmount should be greater than zero");
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount,
+ MachineInstr::FrameSetup);
+ if (!hasFP(MF)) {
+ // If we are using a frame-pointer, and thus emitted ".cfi_def_cfa fp, 0",
+ // don't emit an sp-based .cfi_def_cfa_offset
+ // Emit ".cfi_def_cfa_offset RealStackSize"
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ if (hasFP(MF)) {
// Realign stack.
if (RI->hasStackRealignment(MF)) {
unsigned ShiftAmount = Log2(MFI.getMaxAlign());
@@ -244,10 +272,47 @@ void LoongArchFrameLowering::emitEpilogue(MachineFunction &MF,
MachineInstr::FrameDestroy);
}
+ uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
+ if (FirstSPAdjustAmount) {
+ uint64_t SecondSPAdjustAmount = StackSize - FirstSPAdjustAmount;
+ assert(SecondSPAdjustAmount > 0 &&
+ "SecondSPAdjustAmount should be greater than zero");
+
+ adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg, SecondSPAdjustAmount,
+ MachineInstr::FrameDestroy);
+ StackSize = FirstSPAdjustAmount;
+ }
+
// Deallocate stack
adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy);
}
+// We would like to split the SP adjustment to reduce prologue/epilogue
+// as following instructions. In this way, the offset of the callee saved
+// register could fit in a single store.
+// e.g.
+// addi.d $sp, $sp, -2032
+// st.d $ra, $sp, 2024
+// st.d $fp, $sp, 2016
+// addi.d $sp, $sp, -16
+uint64_t LoongArchFrameLowering::getFirstSPAdjustAmount(
+ const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+ // Return the FirstSPAdjustAmount if the StackSize can not fit in a signed
+ // 12-bit and there exists a callee-saved register needing to be pushed.
+ if (!isInt<12>(MFI.getStackSize()) && (CSI.size() > 0)) {
+ // FirstSPAdjustAmount is chosen as (2048 - StackAlign) because 2048 will
+ // cause sp = sp + 2048 in the epilogue to be split into multiple
+ // instructions. Offsets smaller than 2048 can fit in a single load/store
+ // instruction, and we have to stick with the stack alignment.
+ // So (2048 - StackAlign) will satisfy the stack alignment.
+ return 2048 - getStackAlign().value();
+ }
+ return 0;
+}
+
void LoongArchFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
@@ -307,6 +372,7 @@ StackOffset LoongArchFrameLowering::getFrameIndexReference(
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>();
uint64_t StackSize = MFI.getStackSize();
+ uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
// Callee-saved registers should be referenced relative to the stack
// pointer (positive offset), otherwise use the frame pointer (negative
@@ -325,7 +391,10 @@ StackOffset LoongArchFrameLowering::getFrameIndexReference(
if (FI >= MinCSFI && FI <= MaxCSFI) {
FrameReg = LoongArch::R3;
- Offset += StackOffset::getFixed(StackSize);
+ if (FirstSPAdjustAmount)
+ Offset += StackOffset::getFixed(FirstSPAdjustAmount);
+ else
+ Offset += StackOffset::getFixed(StackSize);
} else if (RI->hasStackRealignment(MF) && !MFI.isFixedObjectIndex(FI)) {
// If the stack was realigned, the frame pointer is set in order to allow
// SP to be restored, so we need another base register to record the stack
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
index e1e3e260f97a2..7ef79aaf32999 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
@@ -45,6 +45,8 @@ class LoongArchFrameLowering : public TargetFrameLowering {
bool hasFP(const MachineFunction &MF) const override;
bool hasBP(const MachineFunction &MF) const;
+ uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const;
+
private:
void determineFrameLayout(MachineFunction &MF) const;
void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
diff --git a/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll b/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll
new file mode 100644
index 0000000000000..093c92b0dadbb
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+;; The stack size is 2048 and the SP adjustment will be split.
+define i32 @SplitSP() nounwind {
+; CHECK-LABEL: SplitSP:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -2032
+; CHECK-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $sp, $sp, -16
+; CHECK-NEXT: addi.d $a0, $sp, 12
+; CHECK-NEXT: bl %plt(foo)
+; CHECK-NEXT: move $a0, $zero
+; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 2032
+; CHECK-NEXT: ret
+entry:
+ %xx = alloca [2028 x i8], align 1
+ %0 = getelementptr inbounds [2028 x i8], ptr %xx, i32 0, i32 0
+ %call = call i32 @foo(ptr nonnull %0)
+ ret i32 0
+}
+
+;; The stack size is 2032 and the SP adjustment will not be split.
+define i32 @NoSplitSP() nounwind {
+; CHECK-LABEL: NoSplitSP:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -2032
+; CHECK-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bl %plt(foo)
+; CHECK-NEXT: move $a0, $zero
+; CHECK-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 2032
+; CHECK-NEXT: ret
+entry:
+ %xx = alloca [2024 x i8], align 1
+ %0 = getelementptr inbounds [2024 x i8], ptr %xx, i32 0, i32 0
+ %call = call i32 @foo(ptr nonnull %0)
+ ret i32 0
+}
+
+declare i32 @foo(ptr)
diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment.ll b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
index 16c7bcd8b1c5a..89672fbe8b0c2 100644
--- a/llvm/test/CodeGen/LoongArch/stack-realignment.ll
+++ b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
@@ -453,46 +453,46 @@ define void @caller_no_realign1024() "no-realign-stack" {
define void @caller2048() {
; LA32-LABEL: caller2048:
; LA32: # %bb.0:
-; LA32-NEXT: addi.w $sp, $sp, -2048
-; LA32-NEXT: .cfi_def_cfa_offset 2048
-; LA32-NEXT: st.w $ra, $sp, 2044 # 4-byte Folded Spill
-; LA32-NEXT: st.w $fp, $sp, 2040 # 4-byte Folded Spill
+; LA32-NEXT: addi.w $sp, $sp, -2032
+; LA32-NEXT: .cfi_def_cfa_offset 2032
+; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill
; LA32-NEXT: .cfi_offset 1, -4
; LA32-NEXT: .cfi_offset 22, -8
; LA32-NEXT: addi.w $fp, $sp, 2032
-; LA32-NEXT: addi.w $fp, $fp, 16
; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: addi.w $sp, $sp, -16
; LA32-NEXT: srli.w $a0, $sp, 11
; LA32-NEXT: slli.w $sp, $a0, 11
; LA32-NEXT: addi.w $a0, $sp, 0
; LA32-NEXT: bl %plt(callee)
; LA32-NEXT: addi.w $sp, $fp, -2048
-; LA32-NEXT: ld.w $fp, $sp, 2040 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $ra, $sp, 2044 # 4-byte Folded Reload
-; LA32-NEXT: addi.w $sp, $sp, 2032
; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 2032
; LA32-NEXT: ret
;
; LA64-LABEL: caller2048:
; LA64: # %bb.0:
-; LA64-NEXT: addi.d $sp, $sp, -2048
-; LA64-NEXT: .cfi_def_cfa_offset 2048
-; LA64-NEXT: st.d $ra, $sp, 2040 # 8-byte Folded Spill
-; LA64-NEXT: st.d $fp, $sp, 2032 # 8-byte Folded Spill
+; LA64-NEXT: addi.d $sp, $sp, -2032
+; LA64-NEXT: .cfi_def_cfa_offset 2032
+; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill
; LA64-NEXT: .cfi_offset 1, -8
; LA64-NEXT: .cfi_offset 22, -16
; LA64-NEXT: addi.d $fp, $sp, 2032
-; LA64-NEXT: addi.d $fp, $fp, 16
; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: addi.d $sp, $sp, -16
; LA64-NEXT: srli.d $a0, $sp, 11
; LA64-NEXT: slli.d $sp, $a0, 11
; LA64-NEXT: addi.d $a0, $sp, 0
; LA64-NEXT: bl %plt(callee)
; LA64-NEXT: addi.d $sp, $fp, -2048
-; LA64-NEXT: ld.d $fp, $sp, 2032 # 8-byte Folded Reload
-; LA64-NEXT: ld.d $ra, $sp, 2040 # 8-byte Folded Reload
-; LA64-NEXT: addi.d $sp, $sp, 2032
; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 2032
; LA64-NEXT: ret
%1 = alloca i8, align 2048
call void @callee(i8* %1)
@@ -531,66 +531,52 @@ define void @caller_no_realign2048() "no-realign-stack" {
define void @caller4096() {
; LA32-LABEL: caller4096:
; LA32: # %bb.0:
-; LA32-NEXT: lu12i.w $a0, 1
-; LA32-NEXT: sub.w $sp, $sp, $a0
-; LA32-NEXT: .cfi_def_cfa_offset 4096
-; LA32-NEXT: ori $a0, $zero, 4092
-; LA32-NEXT: add.w $a0, $sp, $a0
-; LA32-NEXT: st.w $ra, $a0, 0 # 4-byte Folded Spill
-; LA32-NEXT: ori $a0, $zero, 4088
-; LA32-NEXT: add.w $a0, $sp, $a0
-; LA32-NEXT: st.w $fp, $a0, 0 # 4-byte Folded Spill
+; LA32-NEXT: addi.w $sp, $sp, -2032
+; LA32-NEXT: .cfi_def_cfa_offset 2032
+; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill
; LA32-NEXT: .cfi_offset 1, -4
; LA32-NEXT: .cfi_offset 22, -8
-; LA32-NEXT: lu12i.w $a0, 1
-; LA32-NEXT: add.w $fp, $sp, $a0
+; LA32-NEXT: addi.w $fp, $sp, 2032
; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: addi.w $sp, $sp, -2048
+; LA32-NEXT: addi.w $sp, $sp, -16
; LA32-NEXT: srli.w $a0, $sp, 12
; LA32-NEXT: slli.w $sp, $a0, 12
; LA32-NEXT: addi.w $a0, $sp, 0
; LA32-NEXT: bl %plt(callee)
; LA32-NEXT: lu12i.w $a0, 1
; LA32-NEXT: sub.w $sp, $fp, $a0
-; LA32-NEXT: ori $a0, $zero, 4088
-; LA32-NEXT: add.w $a0, $sp, $a0
-; LA32-NEXT: ld.w $fp, $a0, 0 # 4-byte Folded Reload
-; LA32-NEXT: ori $a0, $zero, 4092
-; LA32-NEXT: add.w $a0, $sp, $a0
-; LA32-NEXT: ld.w $ra, $a0, 0 # 4-byte Folded Reload
-; LA32-NEXT: lu12i.w $a0, 1
-; LA32-NEXT: add.w $sp, $sp, $a0
+; LA32-NEXT: addi.w $sp, $sp, 2032
+; LA32-NEXT: addi.w $sp, $sp, 32
+; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 2032
; LA32-NEXT: ret
;
; LA64-LABEL: caller4096:
; LA64: # %bb.0:
-; LA64-NEXT: lu12i.w $a0, 1
-; LA64-NEXT: sub.d $sp, $sp, $a0
-; LA64-NEXT: .cfi_def_cfa_offset 4096
-; LA64-NEXT: ori $a0, $zero, 4088
-; LA64-NEXT: add.d $a0, $sp, $a0
-; LA64-NEXT: st.d $ra, $a0, 0 # 8-byte Folded Spill
-; LA64-NEXT: ori $a0, $zero, 4080
-; LA64-NEXT: add.d $a0, $sp, $a0
-; LA64-NEXT: st.d $fp, $a0, 0 # 8-byte Folded Spill
+; LA64-NEXT: addi.d $sp, $sp, -2032
+; LA64-NEXT: .cfi_def_cfa_offset 2032
+; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill
; LA64-NEXT: .cfi_offset 1, -8
; LA64-NEXT: .cfi_offset 22, -16
-; LA64-NEXT: lu12i.w $a0, 1
-; LA64-NEXT: add.d $fp, $sp, $a0
+; LA64-NEXT: addi.d $fp, $sp, 2032
; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: addi.d $sp, $sp, -2048
+; LA64-NEXT: addi.d $sp, $sp, -16
; LA64-NEXT: srli.d $a0, $sp, 12
; LA64-NEXT: slli.d $sp, $a0, 12
; LA64-NEXT: addi.d $a0, $sp, 0
; LA64-NEXT: bl %plt(callee)
; LA64-NEXT: lu12i.w $a0, 1
; LA64-NEXT: sub.d $sp, $fp, $a0
-; LA64-NEXT: ori $a0, $zero, 4080
-; LA64-NEXT: add.d $a0, $sp, $a0
-; LA64-NEXT: ld.d $fp, $a0, 0 # 8-byte Folded Reload
-; LA64-NEXT: ori $a0, $zero, 4088
-; LA64-NEXT: add.d $a0, $sp, $a0
-; LA64-NEXT: ld.d $ra, $a0, 0 # 8-byte Folded Reload
-; LA64-NEXT: lu12i.w $a0, 1
-; LA64-NEXT: add.d $sp, $sp, $a0
+; LA64-NEXT: addi.d $sp, $sp, 2032
+; LA64-NEXT: addi.d $sp, $sp, 32
+; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 2032
; LA64-NEXT: ret
%1 = alloca i8, align 4096
call void @callee(i8* %1)
More information about the llvm-commits
mailing list