[llvm] [AArch64] Optimization of repeated constant loads (#51483) (PR #86249)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 21 03:43:43 PDT 2024
https://github.com/ParkHanbum updated https://github.com/llvm/llvm-project/pull/86249
>From 16bb2538c1cc77053db4eb544e342216e35ec67e Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Sun, 21 Apr 2024 18:41:21 +0900
Subject: [PATCH 1/4] [AArch64] Add PreTest for optimizing `MOV` to `ORR`
---
.../CodeGen/AArch64/movimm-expand-ldst.mir | 35 +++++++++++++++++++
1 file changed, 35 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
new file mode 100644
index 00000000000000..de14437108c93f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -0,0 +1,35 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=aarch64 -verify-machineinstrs -run-pass=aarch64-expand-pseudo -run-pass=aarch64-ldst-opt -debug-only=aarch64-ldst-opt %s -o - | FileCheck %s
+---
+name: test_fold_repeating_constant_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_fold_repeating_constant_load
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $x0 = MOVZXi 49370, 0
+ ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 16
+ ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 32
+ ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 48
+ ; CHECK-NEXT: RET undef $lr, implicit $x0
+ renamable $x0 = MOVi64imm 90284035103834330
+ RET_ReallyLR implicit $x0
+...
+---
+name: test_fold_repeating_constant_load_neg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_fold_repeating_constant_load_neg
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $x0 = MOVZXi 320, 0
+ ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 16
+ ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 32
+ ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 48
+ ; CHECK-NEXT: RET undef $lr, implicit $x0
+ renamable $x0 = MOVi64imm -4550323095879417536
+ RET_ReallyLR implicit $x0
>From 1d03a1b0648a9e5b0cce6d7ddf6296edf1ac9b8e Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Fri, 12 Apr 2024 16:53:37 +0900
Subject: [PATCH 2/4] [AArch64] Optimize `MOV` to `ORR` when loading symmetric
 constants (#51483)
This change looks for cases of symmetric constant loading: a `symmetric
constant load` is one where the upper 32 bits and the lower 32 bits of a
64-bit register are loaded with the same value.
When such a sequence is found, it is rewritten so that only the lower
32 bits of the constant are materialized and a single instruction then
copies them into the upper half.
For example:
renamable $x8 = MOVZXi 49370, 0
renamable $x8 = MOVKXi $x8, 320, 16
renamable $x8 = MOVKXi $x8, 49370, 32
renamable $x8 = MOVKXi $x8, 320, 48
becomes
renamable $x8 = MOVZXi 49370, 0
renamable $x8 = MOVKXi $x8, 320, 16
renamable $x8 = ORRXrs $x8, $x8, 32
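The gating condition is simply that the two 32-bit halves of the fully
materialized immediate are equal, which is what the check added to
expandMOVImmSimple below expresses as `(Imm >> 32) == (Imm & UINT_MAX)`.
A minimal standalone sketch of that test (the helper name `isSymmetricImm`
is hypothetical and not part of this patch):

  #include <cstdint>

  // True when the upper and lower 32-bit halves of a 64-bit immediate are
  // identical, so the upper half can be produced by ORRXrs with LSL #32
  // instead of two further MOVK instructions.
  static bool isSymmetricImm(uint64_t Imm) {
    return Imm != 0 && (Imm >> 32) == (Imm & 0xFFFFFFFFULL);
  }

For the test immediate 90284035103834330 (0x0140C0DA0140C0DA) both halves
are 0x0140C0DA, i.e. 320 << 16 | 49370, so only the MOVZ/MOVK pair for the
low word is kept and ORRXrs duplicates it into the high word.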
---
llvm/lib/Target/AArch64/AArch64ExpandImm.cpp | 10 ++++++++++
.../lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 13 +++++++++++++
llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir | 6 ++----
3 files changed, 25 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
index a7d72b59b1d5a6..2d37d232b8edd9 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
@@ -509,6 +509,16 @@ static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
Imm = ~Imm;
unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
+ Shift += 16;
+ Imm16 = (Imm >> Shift) & Mask;
+ if (Imm16 != (isNeg ? Mask : 0))
+ Insn.push_back(
+ {Opc, Imm16, AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)});
+ if (Imm != 0 && (Imm >> 32) == (Imm & UINT_MAX)) {
+ Insn.push_back({BitSize == 32 ? AArch64::ORRWrs : AArch64::ORRXrs, 0, 32});
+ return;
+ }
+
while (Shift < LastShift) {
Shift += 16;
Imm16 = (Imm >> Shift) & Mask;
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 03f0778bae59d5..36957bb0f5a059 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -168,6 +168,19 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
.addImm(I->Op2));
}
break;
+ case AArch64::ORRWrs:
+ case AArch64::ORRXrs: {
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ MIBS.push_back(
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
+ .addReg(DstReg, RegState::Define |
+ getDeadRegState(DstIsDead && LastItem) |
+ RenamableState)
+ .addReg(DstReg)
+ .addReg(DstReg)
+ .addImm(I->Op2));
+ } break;
case AArch64::ANDXri:
case AArch64::EORXri:
if (I->Op1 == 0) {
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
index de14437108c93f..1ec2a00f67690b 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -11,8 +11,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $x0 = MOVZXi 49370, 0
; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 16
- ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 32
- ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 48
+ ; CHECK-NEXT: renamable $x0 = ORRXrs $x0, $x0, 32
; CHECK-NEXT: RET undef $lr, implicit $x0
renamable $x0 = MOVi64imm 90284035103834330
RET_ReallyLR implicit $x0
@@ -28,8 +27,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $x0 = MOVZXi 320, 0
; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 16
- ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 32
- ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 48
+ ; CHECK-NEXT: renamable $x0 = ORRXrs $x0, $x0, 32
; CHECK-NEXT: RET undef $lr, implicit $x0
renamable $x0 = MOVi64imm -4550323095879417536
RET_ReallyLR implicit $x0
>From a187e3bf45f0d3a8a1f2c13d2ee6cb3c8bc07b21 Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Sun, 21 Apr 2024 19:20:26 +0900
Subject: [PATCH 3/4] [AArch64] Add PreTest for optimizing `MOV` to `STP`
---
.../CodeGen/AArch64/movimm-expand-ldst.mir | 40 +++++++++++++++++++
1 file changed, 40 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
index 1ec2a00f67690b..245c4c358626d1 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -31,3 +31,43 @@ body: |
; CHECK-NEXT: RET undef $lr, implicit $x0
renamable $x0 = MOVi64imm -4550323095879417536
RET_ReallyLR implicit $x0
+...
+---
+name: test_fold_repeating_constant_load_store_twice
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+ ; CHECK-LABEL: name: test_fold_repeating_constant_load_store_twice
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $x8 = MOVZXi 49370, 0
+ ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 320, 16
+ ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
+ ; CHECK-NEXT: STRXui renamable $x8, killed renamable $x0, 0
+ ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x1, 0
+ ; CHECK-NEXT: RET undef $lr
+ renamable $x8 = MOVi64imm 90284035103834330
+ STRXui renamable $x8, killed renamable $x0, 0
+ STRXui killed renamable $x8, killed renamable $x1, 0
+ RET_ReallyLR
+...
+---
+name: test_fold_repeating_constant_load_use_reg_before_store
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_fold_repeating_constant_load_use_reg_before_store
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $x8 = MOVZXi 49370, 0
+ ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 320, 16
+ ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
+ ; CHECK-NEXT: renamable $x9 = ADDXrs renamable $x8, renamable $x8, 32
+ ; CHECK-NEXT: STRXui renamable $x8, killed renamable $x0, 0
+ ; CHECK-NEXT: RET undef $lr
+ renamable $x8 = MOVi64imm 90284035103834330
+ renamable $x9 = ADDXrs renamable $x8, renamable $x8, 32
+ STRXui renamable $x8, killed renamable $x0, 0
+ RET_ReallyLR
>From 1e6a5ffefd0a1fc2aee6115435457fcac843881a Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Sun, 21 Apr 2024 19:20:40 +0900
Subject: [PATCH 4/4] [AArch64] Optimize `MOV` to `STP` when loading symmetric
 constants (#51483)
This change looks for cases of symmetric constant loading: a `symmetric
constant load` is one where the upper 32 bits and the lower 32 bits of a
64-bit register are loaded with the same value.
When such a constant is only stored to memory, the sequence is rewritten
so that just the lower 32 bits are materialized and the value is written
twice with a single store-pair instruction.
For example:
renamable $x8 = MOVZXi 49370, 0
renamable $x8 = MOVKXi $x8, 320, 16
renamable $x8 = MOVKXi $x8, 49370, 32
renamable $x8 = MOVKXi $x8, 320, 48
STRXui killed renamable $x8, killed renamable $x0, 0
becomes
$w8 = MOVZWi 49370, 0
$w8 = MOVKWi $w8, 320, 16
STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
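The load/store optimizer walks backwards from the `STRXui`, re-assembling
the constant from the MOVZ/MOVK (and possibly ORR) chain and applying the
same half-equality test before rewriting the store. A minimal sketch of
that accumulation, assuming the chain has already been collected as
(imm16, shift) pairs (the function name `chainIsSymmetric` is hypothetical;
the in-tree code works directly on MachineInstr operands):

  #include <cstdint>
  #include <utility>
  #include <vector>

  // Rebuild the value produced by a MOVZ/MOVK chain and report whether its
  // two 32-bit halves match, i.e. whether the 64-bit store can become an
  // STPWi that stores the low half twice.
  static bool chainIsSymmetric(
      const std::vector<std::pair<uint64_t, unsigned>> &Chain,
      uint64_t &Accumulated) {
    Accumulated = 0;
    for (const auto &[Imm16, Shift] : Chain) {
      Accumulated &= ~(0xFFFFULL << Shift); // clear the lane MOVK overwrites
      Accumulated |= Imm16 << Shift;        // insert the 16-bit chunk
    }
    return Accumulated != 0 &&
           (Accumulated >> 32) == (Accumulated & 0xFFFFFFFFULL);
  }

For the chain {49370, 0}, {320, 16}, {49370, 32}, {320, 48} this yields
0x0140C0DA0140C0DA, so after materializing only the low word the store can
become `stp w8, w8, [x0]`.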
---
.../AArch64/AArch64LoadStoreOptimizer.cpp | 173 ++++++++++++++++++
.../CodeGen/AArch64/movimm-expand-ldst.mir | 7 +-
2 files changed, 176 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index d0adb78b231a76..c316bd66074c8f 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -201,6 +201,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge a base register updates before or after a ld/st instruction.
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
+ // Finds and collapses loads of repeated constant values.
+ bool foldRepeatedConstantLoads(MachineBasicBlock::iterator &I,
+ unsigned Limit);
+ MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+ MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+ int SuccIndex, int Accumulated);
+
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2252,6 +2259,151 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
+static bool isRepeatable(MachineInstr &MI, Register BaseReg) {
+ auto MatchBaseReg = [&](unsigned Count) {
+ for (unsigned I = 0; I < Count; I++) {
+ auto OpI = MI.getOperand(I);
+ if (OpI.isReg() && OpI.getReg() != BaseReg)
+ return false;
+ }
+ return true;
+ };
+
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::MOVZXi:
+ return MatchBaseReg(1);
+ case AArch64::MOVKXi:
+ return MatchBaseReg(2);
+ case AArch64::ORRXrs:
+ case AArch64::ORRWrs:
+ MachineOperand &Imm = MI.getOperand(3);
+ unsigned BitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+ if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == BitSize)
+ return true;
+ }
+
+ return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+ MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+ int SuccIndex, int Accumulated) {
+ MachineBasicBlock::iterator I = MI.getIterator();
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+ MachineBasicBlock::iterator FirstMovI;
+ MachineBasicBlock *MBB = MI.getParent();
+ uint64_t Mask = 0xFFFFUL;
+ int Index = 0;
+
+ for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+ if (Index == SuccIndex - 1) {
+ FirstMovI = *MI;
+ break;
+ }
+ (*MI)->eraseFromParent();
+ }
+
+ Register DstRegW =
+ TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+ BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVZWi),
+ DstRegW)
+ .addImm(Accumulated & Mask)
+ .addImm(0);
+ BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVKWi),
+ DstRegW)
+ .addUse(DstRegW)
+ .addImm((Accumulated >> 16) & Mask)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+ FirstMovI->eraseFromParent();
+
+ Register BaseReg = getLdStRegOp(MI).getReg();
+ const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+ DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+ unsigned DstRegState = getRegState(MI.getOperand(0));
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+ .addReg(DstRegW, DstRegState)
+ .addReg(DstRegW, DstRegState)
+ .addReg(MO.getReg(), getRegState(MO))
+ .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+ .setMemRefs(MI.memoperands())
+ .setMIFlags(MI.getFlags());
+ I->eraseFromParent();
+
+ return NextI;
+}
+
+bool AArch64LoadStoreOpt::foldRepeatedConstantLoads(
+ MachineBasicBlock::iterator &I, unsigned Limit) {
+ MachineInstr &MI = *I;
+ if (MI.getOpcode() != AArch64::STRXui)
+ return false;
+
+ MachineBasicBlock::iterator MBBI = I;
+ MachineBasicBlock::iterator B = I->getParent()->begin();
+ if (MBBI == B)
+ return false;
+
+ Register BaseReg = getLdStRegOp(MI).getReg();
+ unsigned Count = 0, SuccIndex = 0, DupBitSize = 0;
+ uint64_t Accumulated = 0;
+ SmallVector<MachineBasicBlock::iterator> MIs;
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
+
+ do {
+ MBBI = prev_nodbg(MBBI, B);
+ MachineInstr &MI = *MBBI;
+ if (!MI.isTransient())
+ ++Count;
+ if (!isRepeatable(MI, BaseReg)) {
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+ TRI);
+ if (!ModifiedRegUnits.available(BaseReg) ||
+ !UsedRegUnits.available(BaseReg))
+ break;
+ continue;
+ }
+
+ unsigned Opc = MI.getOpcode();
+ if (Opc == AArch64::ORRXrs || Opc == AArch64::ORRWrs) {
+ DupBitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+ MIs.push_back(MBBI);
+ continue;
+ }
+ unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+ MachineOperand Value = MI.getOperand(ValueOrder);
+ MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+ if (!Value.isImm() || !Shift.isImm())
+ return false;
+
+ uint64_t IValue = Value.getImm();
+ uint64_t IShift = Shift.getImm();
+ uint64_t mask = 0xFFFFUL;
+ Accumulated -= (Accumulated & (mask << IShift));
+ Accumulated += (IValue << IShift);
+ MIs.push_back(MBBI);
+
+ if (ValueOrder == 1 && DupBitSize) {
+ Accumulated |= Accumulated << DupBitSize;
+ DupBitSize = 0;
+ }
+
+ if (Accumulated != 0 && (Accumulated >> 32) == (Accumulated & UINT_MAX))
+ SuccIndex = MIs.size();
+ } while (MBBI != B && Count < Limit);
+
+ if (SuccIndex) {
+ I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+ return true;
+ }
+
+ return false;
+}
+
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
@@ -2518,6 +2670,27 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
}
+ // We have an opportunity to optimize a `STRXui` that stores a 64-bit
+ // constant whose upper and lower 32-bit halves are identical: materialize
+ // only the 32-bit half and store it twice with a single `STPWi`.
+ // Considering :
+ // mov x8, 49370
+ // movk x8, 320, lsl #16
+ // movk x8, 49370, lsl #32
+ // movk x8, 320, lsl #48
+ // str x8, [x0]
+ // Transform :
+ // mov w8, 49370
+ // movk w8, 320, lsl #16
+ // stp w8, w8, [x0]
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ if (foldRepeatedConstantLoads(MBBI, UpdateLimit))
+ Modified = true;
+ else
+ ++MBBI;
+ }
+
return Modified;
}
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
index 245c4c358626d1..c6680cae7de40e 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -41,10 +41,9 @@ body: |
; CHECK-LABEL: name: test_fold_repeating_constant_load_store_twice
; CHECK: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $x8 = MOVZXi 49370, 0
- ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 320, 16
- ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
- ; CHECK-NEXT: STRXui renamable $x8, killed renamable $x0, 0
+ ; CHECK-NEXT: $w8 = MOVZWi 49370, 0
+ ; CHECK-NEXT: $w8 = MOVKWi $w8, 320, 16
+ ; CHECK-NEXT: STPWi renamable $w8, renamable $w8, killed renamable $x0, 0
; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x1, 0
; CHECK-NEXT: RET undef $lr
renamable $x8 = MOVi64imm 90284035103834330