[compiler-rt] [flang] [clang] [clang-tools-extra] [llvm] [PowerPC][CodeGen] Exploit STMW and LMW in 32-bit big-endian mode. (PR #74415)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Dec 5 00:45:18 PST 2023
https://github.com/EsmeYi updated https://github.com/llvm/llvm-project/pull/74415
>From f6d0ef8357540c61a9c20774e3b170a8db5b72ca Mon Sep 17 00:00:00 2001
From: esmeyi <esme.yi at ibm.com>
Date: Tue, 5 Dec 2023 00:44:04 -0500
Subject: [PATCH 1/2] Exploit STMW and LMW in 32-bit big-endian mode.
---
llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 76 ++++++++++++-
llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 4 +
llvm/test/CodeGen/PowerPC/stm-lm-merge.ll | 110 +++++++++++++++++++
3 files changed, 188 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/PowerPC/stm-lm-merge.ll
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index eb3bf3b2690b2..4d4ef6251a999 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -40,6 +40,12 @@ EnablePEVectorSpills("ppc-enable-pe-vector-spills",
cl::desc("Enable spills in prologue to vector registers."),
cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ EnableLoadStoreMultiple("ppc-enable-load-store-multiple",
+ cl::desc("Enable load/store multiple (only "
+ "support in 32-bit big-endian mode)."),
+ cl::init(false), cl::Hidden);
+
static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) {
if (STI.isAIXABI())
return STI.isPPC64() ? 16 : 8;
@@ -2407,6 +2413,30 @@ bool PPCFrameLowering::assignCalleeSavedSpillSlots(
return AllSpilledToReg;
}
+static void findContinuousLoadStore(ArrayRef<CalleeSavedInfo> CSI,
+ Register &MergeFrom) {
+ CalleeSavedInfo BeginI = CSI[0];
+ unsigned I = 1, E = CSI.size();
+ for (; I < E; ++I) {
+ // Find continuous store/load.
+ unsigned RegDiff = CSI[I].getReg() - CSI[I - 1].getReg();
+ unsigned FrameIdxDiff = CSI[I - 1].getFrameIdx() - CSI[I].getFrameIdx();
+ Register BeginReg = BeginI.getReg();
+ if (BeginReg < PPC::R0 || BeginReg > PPC::R31 || BeginI.isSpilledToReg() ||
+ RegDiff != 1 || FrameIdxDiff != 1)
+ BeginI = CSI[I];
+ if (CSI[I].getReg() == PPC::R31)
+ break;
+ }
+
+ if (I == E || BeginI.getReg() == PPC::R31)
+ return;
+
+ // Record the first reg that STMW/LMW are going to merge since STMW/LMW save
+ // from rN to r31.
+ MergeFrom = BeginI.getReg();
+}
+
bool PPCFrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
@@ -2437,6 +2467,11 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
}
}
+ Register MergeFrom = PPC::R31;
+ if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() &&
+ !Subtarget.isPPC64())
+ findContinuousLoadStore(CSI, MergeFrom);
+
for (const CalleeSavedInfo &I : CSI) {
Register Reg = I.getReg();
@@ -2521,7 +2556,23 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
!MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn,
I.getFrameIdx(), RC, TRI);
- else
+ else if (MergeFrom < PPC::R31 && Reg == MergeFrom) {
+ // Build an STMW instruction.
+ int FrameIdx = I.getFrameIdx();
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBB.begin(), DL, TII.get(PPC::STMW));
+ MIB.addReg(Reg, getKillRegState(!IsLiveIn));
+ // Add frame reference.
+ MIB.addImm(0).addFrameIndex(FrameIdx);
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FrameIdx),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlign(FrameIdx));
+ MIB.addMemOperand(MMO);
+ } else if (Reg > MergeFrom && Reg <= PPC::R31)
+ continue;
+ else
TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC,
TRI, Register());
}
@@ -2615,6 +2666,11 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
unsigned CSIIndex = 0;
BitVector Restored(TRI->getNumRegs());
+ Register MergeFrom = PPC::R31;
+ if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() &&
+ !Subtarget.isPPC64())
+ findContinuousLoadStore(CSI, MergeFrom);
+
// Initialize insertion-point logic; we will be restoring in reverse
// order of spill.
MachineBasicBlock::iterator I = MI, BeforeI = I;
@@ -2694,7 +2750,23 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
!MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC,
TRI);
- else
+ else if (MergeFrom < PPC::R31 && Reg == MergeFrom) {
+ // Build an LMW instruction.
+ int FrameIdx = CSI[i].getFrameIdx();
+ DebugLoc DL;
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBB.begin(), DL, TII.get(PPC::LMW), Reg);
+ // Add frame reference.
+ MIB.addImm(0).addFrameIndex(FrameIdx);
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FrameIdx),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlign(FrameIdx));
+ MIB.addMemOperand(MMO);
+ } else if (Reg > MergeFrom && Reg <= PPC::R31)
+ continue;
+ else
TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI,
Register());
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 7d913a77cc715..6f5cc5e550e0f 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1676,6 +1676,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP &&
OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC);
+ // STMW and LMW only have immediate form.
+ if (OpC == PPC::STMW || OpC == PPC::LMW)
+ noImmForm = false;
+
// Now add the frame object offset to the offset from r1.
int64_t Offset = MFI.getObjectOffset(FrameIndex);
Offset += MI.getOperand(OffsetOperandNo).getImm();
diff --git a/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll
new file mode 100644
index 0000000000000..f486317f758a4
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll
@@ -0,0 +1,110 @@
+; RUN: llc -mtriple=powerpc-unknown-aix-xcoff -verify-machineinstrs \
+; RUN: -mcpu=pwr4 -mattr=-altivec --ppc-enable-load-store-multiple < %s \
+; RUN: | FileCheck %s
+
+; CHECK: stmw 16, 64(1) # 4-byte Folded Spill
+; CHECK: lmw 16, 64(1) # 4-byte Folded Reload
+
+@a = external local_unnamed_addr global i32, align 4
+@b = external local_unnamed_addr global i32, align 4
+@f = external local_unnamed_addr global i32, align 4
+@c = external local_unnamed_addr global i32, align 4
+@g = external local_unnamed_addr global i32, align 4
+@e = external local_unnamed_addr global i32, align 4
+@h = external local_unnamed_addr global i32, align 4
+@d = external local_unnamed_addr global i32, align 4
+
+; Function Attrs: nounwind
+define i32 @foo(ptr noundef %b1, ptr noundef %b2, i32 noundef %count) local_unnamed_addr #0 {
+entry:
+ %invariant.gep = getelementptr i32, ptr %b2, i32 -1
+ %cmp63 = icmp sgt i32 %count, 0
+ br i1 %cmp63, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %sw.epilog, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add29, %sw.epilog ]
+ %0 = load i32, ptr @a, align 4
+ %add30 = add nsw i32 %0, %sum.0.lcssa
+ %1 = load i32, ptr @b, align 4
+ %add31 = add nsw i32 %add30, %1
+ %2 = load i32, ptr @c, align 4
+ %add32 = add nsw i32 %add31, %2
+ %3 = load i32, ptr @d, align 4
+ %add33 = add nsw i32 %add32, %3
+ %4 = load i32, ptr @e, align 4
+ %add34 = add nsw i32 %add33, %4
+ ret i32 %add34
+
+for.body: ; preds = %entry, %sw.epilog
+ %i.065 = phi i32 [ %inc, %sw.epilog ], [ 0, %entry ]
+ %sum.064 = phi i32 [ %add29, %sw.epilog ], [ 0, %entry ]
+ tail call void @foo1(ptr noundef %b1, ptr noundef %b2) #2
+ %rem = and i32 %i.065, 3
+ switch i32 %rem, label %for.body.unreachabledefault [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb4
+ i32 2, label %sw.bb11
+ i32 3, label %sw.bb19
+ ]
+
+sw.bb: ; preds = %for.body
+ %gep62 = getelementptr i32, ptr %invariant.gep, i32 %i.065
+ %5 = load i32, ptr %gep62, align 4
+ %6 = load i32, ptr @a, align 4
+ %7 = load i32, ptr @b, align 4
+ %reass.add = add i32 %7, %5
+ %reass.mul = mul i32 %reass.add, 3
+ %add2 = add i32 %reass.mul, %6
+ br label %sw.epilog
+
+sw.bb4: ; preds = %for.body
+ %arrayidx5 = getelementptr inbounds i32, ptr %b2, i32 %i.065
+ %8 = load i32, ptr %arrayidx5, align 4
+ %mul6 = mul nsw i32 %8, 5
+ %9 = load i32, ptr @b, align 4
+ %10 = load i32, ptr @f, align 4
+ %11 = shl i32 %10, 2
+ %12 = add i32 %9, %11
+ %sub9 = sub i32 %mul6, %12
+ br label %sw.epilog
+
+sw.bb11: ; preds = %for.body
+ %gep = getelementptr i32, ptr %invariant.gep, i32 %i.065
+ %13 = load i32, ptr %gep, align 4
+ %mul14 = shl nsw i32 %13, 2
+ %14 = load i32, ptr @c, align 4
+ %mul15 = mul nsw i32 %mul14, %14
+ %15 = load i32, ptr @g, align 4
+ %mul16 = mul nsw i32 %15, 5
+ %add17 = add nsw i32 %mul16, %mul15
+ br label %sw.epilog
+
+sw.bb19: ; preds = %for.body
+ %arrayidx20 = getelementptr inbounds i32, ptr %b2, i32 %i.065
+ %16 = load i32, ptr %arrayidx20, align 4
+ %mul21 = mul nsw i32 %16, 6
+ %17 = load i32, ptr @e, align 4
+ %div = sdiv i32 %mul21, %17
+ %div22 = udiv i32 6, %i.065
+ %add23 = add nsw i32 %div22, %div
+ br label %sw.epilog
+
+for.body.unreachabledefault: ; preds = %for.body
+ unreachable
+
+sw.epilog: ; preds = %sw.bb19, %sw.bb11, %sw.bb4, %sw.bb
+ %add23.sink = phi i32 [ %add23, %sw.bb19 ], [ %add17, %sw.bb11 ], [ %sub9, %sw.bb4 ], [ %add2, %sw.bb ]
+ %arrayidx24 = getelementptr inbounds i32, ptr %b1, i32 %i.065
+ store i32 %add23.sink, ptr %arrayidx24, align 4
+ %arrayidx26 = getelementptr inbounds i32, ptr %b2, i32 %i.065
+ %18 = load i32, ptr %arrayidx26, align 4
+ %19 = load i32, ptr @h, align 4
+ %add27 = add i32 %add23.sink, %sum.064
+ %add28 = add i32 %add27, %18
+ %add29 = add i32 %add28, %19
+ %inc = add nuw nsw i32 %i.065, 1
+ %exitcond.not = icmp eq i32 %inc, %count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare void @foo1(ptr noundef, ptr noundef) local_unnamed_addr #1
>From fcab87aa7c93cde06ffa5cf4e722896e370aa3f5 Mon Sep 17 00:00:00 2001
From: esmeyi <esme.yi at ibm.com>
Date: Tue, 5 Dec 2023 03:44:06 -0500
Subject: [PATCH 2/2] Address ecnelises's comments.
---
llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 35 +++++++++++---------
llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 8 ++---
llvm/test/CodeGen/PowerPC/stm-lm-merge.ll | 32 ++++++++++++++++--
3 files changed, 52 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index fdd1c6d508638..705dd9140c1b0 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -2407,26 +2407,25 @@ bool PPCFrameLowering::assignCalleeSavedSpillSlots(
static void findContinuousLoadStore(ArrayRef<CalleeSavedInfo> CSI,
Register &MergeFrom) {
- CalleeSavedInfo BeginI = CSI[0];
- unsigned I = 1, E = CSI.size();
+ unsigned I = 1, E = CSI.size(), BeginI = 0;
for (; I < E; ++I) {
// Find continuous store/load.
- unsigned RegDiff = CSI[I].getReg() - CSI[I - 1].getReg();
- unsigned FrameIdxDiff = CSI[I - 1].getFrameIdx() - CSI[I].getFrameIdx();
- Register BeginReg = BeginI.getReg();
- if (BeginReg < PPC::R0 || BeginReg > PPC::R31 || BeginI.isSpilledToReg() ||
- RegDiff != 1 || FrameIdxDiff != 1)
- BeginI = CSI[I];
+ int RegDiff = CSI[I].getReg() - CSI[I - 1].getReg();
+ int FrameIdxDiff = CSI[I - 1].getFrameIdx() - CSI[I].getFrameIdx();
+ Register BeginReg = CSI[BeginI].getReg();
+ if (BeginReg < PPC::R0 || BeginReg > PPC::R31 ||
+ CSI[BeginI].isSpilledToReg() || RegDiff != 1 || FrameIdxDiff != 1)
+ BeginI = I;
if (CSI[I].getReg() == PPC::R31)
break;
}
- if (I == E || BeginI.getReg() == PPC::R31)
+ if (I == E || CSI[BeginI].getReg() >= PPC::R31)
return;
// Record the first reg that STMW/LMW are going to merge since STMW/LMW save
// from rN to r31.
- MergeFrom = BeginI.getReg();
+ MergeFrom = CSI[BeginI].getReg();
}
bool PPCFrameLowering::spillCalleeSavedRegisters(
@@ -2459,6 +2458,8 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
}
}
+ // STMW stores rN through r31, where rN < r31. MergeFrom will be less than
+ // r31 if continuous stores are found.
Register MergeFrom = PPC::R31;
if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() &&
!Subtarget.isPPC64())
@@ -2546,8 +2547,8 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
// saved vector registers.
if (Subtarget.needsSwapsForVSXMemOps() &&
!MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
- TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn,
- I.getFrameIdx(), RC, TRI);
+ TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(),
+ RC, TRI);
else if (MergeFrom < PPC::R31 && Reg == MergeFrom) {
// Build an STMW instruction.
int FrameIdx = I.getFrameIdx();
@@ -2564,7 +2565,7 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
MIB.addMemOperand(MMO);
} else if (Reg > MergeFrom && Reg <= PPC::R31)
continue;
- else
+ else
TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC,
TRI, Register());
}
@@ -2658,6 +2659,8 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
unsigned CSIIndex = 0;
BitVector Restored(TRI->getNumRegs());
+ // LMW loads rN through r31, where rN < r31. MergeFrom will be less than
+ // r31 if continuous loads are found.
Register MergeFrom = PPC::R31;
if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() &&
!Subtarget.isPPC64())
@@ -2756,13 +2759,13 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
MFI.getObjectAlign(FrameIdx));
MIB.addMemOperand(MMO);
- } else if (Reg > MergeFrom && Reg <= PPC::R31)
+ } else if (Reg > MergeFrom && Reg < PPC::R31)
continue;
- else
+ else
TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI,
Register());
- assert(I != MBB.begin() &&
+ assert(I != MBB.begin() &&
"loadRegFromStackSlot didn't insert any code!");
}
}
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 6f5cc5e550e0f..f582f2d35b495 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1673,12 +1673,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If the instruction is not present in ImmToIdxMap, then it has no immediate
// form (and must be r+r).
- bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP &&
- OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC);
-
// STMW and LMW only have immediate form.
- if (OpC == PPC::STMW || OpC == PPC::LMW)
- noImmForm = false;
+ bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP &&
+ OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC) &&
+ OpC != PPC::STMW && OpC != PPC::LMW;
// Now add the frame object offset to the offset from r1.
int64_t Offset = MFI.getObjectOffset(FrameIndex);
diff --git a/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll
index f486317f758a4..94a96c7dd0a5e 100644
--- a/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll
+++ b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll
@@ -2,8 +2,36 @@
; RUN: -mcpu=pwr4 -mattr=-altivec --ppc-enable-load-store-multiple < %s \
; RUN: | FileCheck %s
-; CHECK: stmw 16, 64(1) # 4-byte Folded Spill
-; CHECK: lmw 16, 64(1) # 4-byte Folded Reload
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mflr 0
+; CHECK-NEXT: stwu 1, -128(1)
+; CHECK-NEXT: cmpwi 5, 0
+; CHECK-NEXT: stw 0, 136(1)
+; CHECK-NEXT: stmw 16, 64(1) # 4-byte Folded Spill
+; CHECK-NEXT: ble 0, L..BB0_11
+
+; CHECK: L..BB0_12: # %for.cond.cleanup
+; CHECK-NEXT: lwz 3, L..C0(2) # @a
+; CHECK-NEXT: lwz 4, L..C1(2) # @b
+; CHECK-NEXT: lwz 5, L..C4(2) # @c
+; CHECK-NEXT: lwz 6, L..C7(2) # @d
+; CHECK-NEXT: lwz 7, L..C6(2) # @e
+; CHECK-NEXT: lmw 16, 64(1) # 4-byte Folded Reload
+; CHECK-NEXT: lwz 3, 0(3)
+; CHECK-NEXT: lwz 4, 0(4)
+; CHECK-NEXT: add 3, 3, 28
+; CHECK-NEXT: lwz 5, 0(5)
+; CHECK-NEXT: add 3, 3, 4
+; CHECK-NEXT: lwz 6, 0(6)
+; CHECK-NEXT: add 3, 3, 5
+; CHECK-NEXT: lwz 4, 0(7)
+; CHECK-NEXT: add 3, 3, 6
+; CHECK-NEXT: add 3, 3, 4
+; CHECK-NEXT: lwz 31, 124(1) # 4-byte Folded Reload
+; CHECK-NEXT: addi 1, 1, 128
+; CHECK-NEXT: lwz 0, 8(1)
+; CHECK-NEXT: mtlr 0
+; CHECK-NEXT: bl
@a = external local_unnamed_addr global i32, align 4
@b = external local_unnamed_addr global i32, align 4
More information about the cfe-commits
mailing list