[llvm] [AArch64] Optimization of repeated constant loads (#51483) (PR #86249)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 21 22:59:13 PDT 2024
https://github.com/ParkHanbum created https://github.com/llvm/llvm-project/pull/86249
This change looks for cases where the same 32-bit constant is loaded into both the upper and lower halves of a 64-bit register before being stored. When such a case is found, the constant is materialized only once, into a 32-bit register, and the 64-bit store is replaced with a store-pair instruction that writes that register to both halves of the memory location.
For example:
renamable $x8 = MOVZXi 49370, 0
renamable $x8 = MOVKXi $x8, 320, 16
renamable $x8 = MOVKXi $x8, 49370, 32
renamable $x8 = MOVKXi $x8, 320, 48
STRXui killed renamable $x8, killed renamable $x0, 0
becomes
$w8 = MOVZWi 49370, 0
$w8 = MOVKWi $w8, 320, 16
STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
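For reference, here is a minimal C++ snippet (the function name is hypothetical; the constant is chosen to match the immediates above, 49370 == 0xC0DA and 320 == 0x0140) that can lead to such a MOVZ/MOVK chain followed by a 64-bit store, depending on how the backend expands the immediate:
#include <cstdint>
// Both 32-bit halves of the stored constant are 0x0140C0DA.
void store_repeated(uint64_t *p) { *p = 0x0140C0DA0140C0DAULL; }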
From e1cf253f6875e1e43eacdd8229a1ce1509ad2218 Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Fri, 22 Mar 2024 14:31:19 +0900
Subject: [PATCH] [AArch64] Optimization of repeated constant loads (#51483)
This change looks for cases where the same 32-bit constant is loaded into
both the upper and lower halves of a 64-bit register before being stored.
When such a case is found, the constant is materialized only once, into a
32-bit register, and the 64-bit store is replaced with a store-pair
instruction that writes that register to both halves of the memory location.
For example:
renamable $x8 = MOVZXi 49370, 0
renamable $x8 = MOVKXi $x8, 320, 16
renamable $x8 = MOVKXi $x8, 49370, 32
renamable $x8 = MOVKXi $x8, 320, 48
STRXui killed renamable $x8, killed renamable $x0, 0
becomes
$w8 = MOVZWi 49370, 0
$w8 = MOVKWi $w8, 320, 16
STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
---
.../AArch64/AArch64LoadStoreOptimizer.cpp | 148 ++++++++++++++++++
1 file changed, 148 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 926a89466255ca..0b7dc874f614fc 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -199,6 +199,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge a base register updates before or after a ld/st instruction.
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
+ // Finds and collapses loads of repeated constant values.
+ bool foldRepeatedConstantLoads(MachineBasicBlock::iterator &I,
+ unsigned Limit);
+ MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+ MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+ int SuccIndex, int Accumulated);
+
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2250,6 +2257,126 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
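+// Return the operand index of the immediate value for a constant-building
+// MOV: 1 for MOVZ/MOVN (dst, imm, shift), 2 for MOVK (dst, src, imm, shift),
+// or 0 if Opc is not one of these.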
+static unsigned getMovValueOrder(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return 0;
+ case AArch64::MOVNXi:
+ case AArch64::MOVZXi:
+ return 1;
+ case AArch64::MOVKXi:
+ return 2;
+ }
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+ MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+ int SuccIndex, int Accumulated) {
+ MachineBasicBlock::iterator I = MI.getIterator();
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+ MachineBasicBlock::iterator FirstMovI;
+ MachineBasicBlock *MBB = MI.getParent();
+ uint64_t Mask = 0xFFFFUL;
+ int Index = 0;
+
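+ // Erase the MOV instructions that were folded into the repeated pattern,
+ // keeping the earliest one in the chain (FirstMovI) as the insertion point
+ // for the new 32-bit materialization; it is removed once the replacement
+ // has been emitted.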
+ for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+ if (Index == SuccIndex - 1) {
+ FirstMovI = *MI;
+ break;
+ }
+ (*MI)->eraseFromParent();
+ }
+
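+ // Materialize the repeated 32-bit half once, with a MOVZ/MOVK pair on the
+ // W sub-register of the original destination, then remove the original MOV.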
+ Register DstRegW =
+ TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+ BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVZWi),
+ DstRegW)
+ .addImm(Accumulated & Mask)
+ .addImm(0);
+ BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVKWi),
+ DstRegW)
+ .addUse(DstRegW)
+ .addImm((Accumulated >> 16) & Mask)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+ FirstMovI->eraseFromParent();
+
+ Register BaseReg = getLdStRegOp(MI).getReg();
+ const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+ DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+ unsigned DstRegState = getRegState(MI.getOperand(0));
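+ // Rewrite the STRXui as an STPWi that stores the 32-bit sub-register twice,
+ // covering both halves of the original 64-bit location.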
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+ .addReg(DstRegW, DstRegState)
+ .addReg(DstRegW, DstRegState)
+ .addReg(MO.getReg(), getRegState(MO))
+ .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+ .setMemRefs(MI.memoperands())
+ .setMIFlags(MI.getFlags());
+ I->eraseFromParent();
+
+ return NextI;
+}
+
+bool AArch64LoadStoreOpt::foldRepeatedConstantLoads(
+ MachineBasicBlock::iterator &I, unsigned Limit) {
+ MachineInstr &MI = *I;
+ if (MI.getOpcode() != AArch64::STRXui)
+ return false;
+
+ MachineBasicBlock::iterator MBBI = I;
+ MachineBasicBlock::iterator B = I->getParent()->begin();
+ if (MBBI == B)
+ return false;
+
+ Register BaseReg = getLdStRegOp(MI).getReg();
+ unsigned Count = 0, SuccIndex = 0;
+ uint64_t Accumulated = 0;
+ SmallVector<MachineBasicBlock::iterator> MIs;
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
+
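+ // Walk backwards from the store, accumulating the 64-bit constant assembled
+ // by the MOVZ/MOVK chain that defines the stored register, and bail out if
+ // an intervening instruction modifies or uses that register.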
+ do {
+ MBBI = prev_nodbg(MBBI, B);
+ MachineInstr &MI = *MBBI;
+ if (!MI.isTransient())
+ ++Count;
+ unsigned CurrOpc = MI.getOpcode();
+ unsigned ValueOrder = getMovValueOrder(CurrOpc);
+ if (!ValueOrder) {
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+ TRI);
+ continue;
+ }
+
+ MachineOperand Value = MI.getOperand(ValueOrder);
+ MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+ if (!Value.isImm() || !Shift.isImm())
+ return false;
+
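+ // Clear the 16-bit field this MOV writes, then merge its immediate at the
+ // shifted position into the accumulated constant.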
+ uint64_t IValue = Value.getImm();
+ uint64_t IShift = Shift.getImm();
+ uint64_t Mask = 0xFFFFUL;
+ Accumulated -= (Accumulated & (Mask << IShift));
+ Accumulated += (IValue << IShift);
+ MIs.push_back(MBBI);
+
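+ // If the upper and lower 32-bit halves of the accumulated value match,
+ // remember how many instructions back the repeated pattern became complete.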
+ if (Accumulated != 0 && (Accumulated >> 32) == (Accumulated & UINT_MAX))
+ SuccIndex = Count;
+ if (!ModifiedRegUnits.available(BaseReg) ||
+ !UsedRegUnits.available(BaseReg))
+ break;
+ } while (MBBI != B && Count < Limit);
+
+ if (SuccIndex) {
+ I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+ return true;
+ }
+
+ return false;
+}
+
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
@@ -2512,6 +2639,27 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
}
+ // Try to fold a store of a 64-bit constant whose upper and lower 32-bit
+ // halves are identical. In that case the constant only needs to be
+ // materialized once, into a 32-bit register, and the STRXui can be
+ // rewritten as an STPWi that stores that register twice.
+ // Considering :
+ // mov x8, 49370
+ // movk x8, 320, lsl #16
+ // movk x8, 49370, lsl #32
+ // movk x8, 320, lsl #48
+ // str x8, [x0]
+ // Transform :
+ // mov w8, 49370
+ // movk w8, 320, lsl #16
+ // stp w8, w8, [x0]
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ if (foldRepeatedConstantLoads(MBBI, UpdateLimit))
+ Modified = true;
+ else
+ ++MBBI;
+ }
+
return Modified;
}