[llvm] [AArch64] Reland merge index address with large offset into base address (PR #79951)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 29 23:03:34 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Allen (vfdff)
<details>
<summary>Changes</summary>
A case for this transformation, https://gcc.godbolt.org/z/nhYcWq1WE
```
Fold
mov w8, https://github.com/llvm/llvm-project/issues/56952
movk w8, https://github.com/llvm/llvm-project/pull/15, lsl https://github.com/llvm/llvm-project/pull/16
ldrb w0, [x0, x8]
into
add x0, x0, 1036288
ldrb w0, [x0, 3704]
```
Only LDRBBroX is supported for the first time.
Fix https://github.com/llvm/llvm-project/issues/71917
---
Full diff: https://github.com/llvm/llvm-project/pull/79951.diff
5 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+10)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.h (+3)
- (modified) llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (+232)
- (modified) llvm/test/CodeGen/AArch64/arm64-addrmode.ll (+6-9)
- (modified) llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir (+22-3)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 13e9d9725cc2e..2e8d8c63d6bec 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4098,6 +4098,16 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
return MI.getOperand(Idx);
}
+const MachineOperand &
+AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case AArch64::LDRBBroX:
+ return MI.getOperand(4);
+ }
+}
+
static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
Register Reg) {
if (MI.getParent() == nullptr)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 6526f6740747a..db24a19fe5f8e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -111,6 +111,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
/// Returns the immediate offset operator of a load/store.
static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
+ /// Returns the shift amount operator of a load/store.
+ static const MachineOperand &getLdStAmountOp(const MachineInstr &MI);
+
/// Returns whether the instruction is FP or NEON.
static bool isFpOrNEON(const MachineInstr &MI);
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 926a89466255c..8be19652edbb5 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated,
"Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+STATISTIC(NumConstOffsetFolded,
+ "Number of const offset of index address folded");
DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
"Controls which pairs are considered for renaming");
@@ -75,6 +77,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
cl::Hidden);
+// The LdStConstLimit limits how far we search for const offset instructions
+// when we form index address load/store instructions.
+static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
+ cl::init(10), cl::Hidden);
+
// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
cl::init(true), cl::Hidden);
@@ -171,6 +178,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
int UnscaledOffset, unsigned Limit);
+ // Scan the instruction list to find a register assigned with a const
+ // value that can be combined with the current instruction (a load or store)
+ // using base addressing with writeback. Scan forwards.
+ MachineBasicBlock::iterator
+ findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
+ unsigned &Offset);
+
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan backwards.
@@ -182,11 +196,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
unsigned BaseReg, int Offset);
+ bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
+ unsigned IndexReg, unsigned &Offset);
+
// Merge a pre- or post-index base register update into a ld/st instruction.
MachineBasicBlock::iterator
mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update, bool IsPreIdx);
+ MachineBasicBlock::iterator
+ mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update, unsigned Offset,
+ int Scale);
+
// Find and merge zero store instructions.
bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
@@ -199,6 +221,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge a base register updates before or after a ld/st instruction.
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
+ // Find and merge a index ldr/st instructions into a base ld/st instruction.
+ bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
+
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -481,6 +506,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
}
}
+static unsigned getBaseAddressOpcode(unsigned Opc) {
+ // TODO: Add more index address loads/stores.
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no base address equivalent!");
+ case AArch64::LDRBBroX:
+ return AArch64::LDRBBui;
+ }
+}
+
static unsigned getPostIndexedOpcode(unsigned Opc) {
switch (Opc) {
default:
@@ -722,6 +757,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
}
}
+// Make sure this is a reg+reg Ld/St
+static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default:
+ return false;
+ // Scaled instructions.
+ // TODO: Add more index address loads/stores.
+ case AArch64::LDRBBroX:
+ Scale = 1;
+ return true;
+ }
+}
+
static bool isRewritableImplicitDef(unsigned Opc) {
switch (Opc) {
default:
@@ -2048,6 +2097,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
return NextI;
}
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update,
+ unsigned Offset, int Scale) {
+ assert((Update->getOpcode() == AArch64::MOVKWi) &&
+ "Unexpected const mov instruction to merge!");
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+ MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
+ MachineInstr &MemMI = *I;
+ unsigned Mask = (1 << 12) * Scale - 1;
+ unsigned Low = Offset & Mask;
+ unsigned High = Offset - Low;
+ Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+ MachineInstrBuilder AddMIB, MemMIB;
+
+ // Add IndexReg, BaseReg, High (the BaseReg may be SP)
+ AddMIB =
+ BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
+ .addDef(IndexReg)
+ .addUse(BaseReg)
+ .addImm(High >> 12) // shifted value
+ .addImm(12); // shift 12
+ (void)AddMIB;
+ // Ld/St DestReg, IndexReg, Imm12
+ unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
+ MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .add(getLdStRegOp(MemMI))
+ .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
+ .addImm(Low / Scale)
+ .setMemRefs(I->memoperands())
+ .setMIFlags(I->mergeFlagsWith(*Update));
+ (void)MemMIB;
+
+ ++NumConstOffsetFolded;
+ LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
+ LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
+ LLVM_DEBUG(PrevI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(Update->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions for the block.
+ I->eraseFromParent();
+ PrevI->eraseFromParent();
+ Update->eraseFromParent();
+
+ return NextI;
+}
+
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
MachineInstr &MI,
unsigned BaseReg, int Offset) {
@@ -2095,6 +2201,34 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
return false;
}
+bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
+ MachineInstr &MI,
+ unsigned IndexReg,
+ unsigned &Offset) {
+ // The update instruction source and destination register must be the
+ // same as the load/store index register.
+ if (MI.getOpcode() == AArch64::MOVKWi &&
+ TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
+
+ // movz + movk hold a large offset of a Ld/St instruction.
+ MachineBasicBlock::iterator B = MI.getParent()->begin();
+ MachineBasicBlock::iterator MBBI = &MI;
+ // Skip the scene when the MI is the first instruction of a block.
+ if (MBBI == B)
+ return false;
+ MBBI = prev_nodbg(MBBI, B);
+ MachineInstr &MovzMI = *MBBI;
+ if (MovzMI.getOpcode() == AArch64::MOVZWi) {
+ unsigned Low = MovzMI.getOperand(1).getImm();
+ unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
+ Offset = High + Low;
+ // 12-bit optionally shifted immediates are legal for adds.
+ return Offset >> 24 == 0;
+ }
+ }
+ return false;
+}
+
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
@@ -2250,6 +2384,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
+ MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
+ MachineBasicBlock::iterator B = I->getParent()->begin();
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineInstr &MemMI = *I;
+ MachineBasicBlock::iterator MBBI = I;
+
+ // If the load is the first instruction in the block, there's obviously
+ // not any matching load or store.
+ if (MBBI == B)
+ return E;
+
+ // Make sure the IndexReg is killed and the shift amount is zero.
+ // TODO: Relex this restriction to extend, simplify processing now.
+ if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
+ !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
+ (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
+ return E;
+
+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+
+ // Track which register units have been modified and used between the first
+ // insn (inclusive) and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
+ unsigned Count = 0;
+ do {
+ MBBI = prev_nodbg(MBBI, B);
+ MachineInstr &MI = *MBBI;
+
+ // Don't count transient instructions towards the search limit since there
+ // may be different numbers of them if e.g. debug information is present.
+ if (!MI.isTransient())
+ ++Count;
+
+ // If we found a match, return it.
+ if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
+ return MBBI;
+ }
+
+ // Update the status of what the instruction clobbered and used.
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
+
+ // Otherwise, if the index register is used or modified, we have no match,
+ // so return early.
+ if (!ModifiedRegUnits.available(IndexReg) ||
+ !UsedRegUnits.available(IndexReg))
+ return E;
+
+ } while (MBBI != B && Count < Limit);
+ return E;
+}
+
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
@@ -2434,6 +2622,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
return false;
}
+bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
+ int Scale) {
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock::iterator E = MI.getParent()->end();
+ MachineBasicBlock::iterator Update;
+
+ // Don't know how to handle unscaled pre/post-index versions below, so bail.
+ if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
+ return false;
+
+ // Look back to try to find a const offset for index LdSt instruction. For
+ // example,
+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
+ // ldr x1, [x0, x8]
+ // merged into:
+ // add x8, x0, a * (1<<12)
+ // ldr x1, [x8, imm12]
+ unsigned Offset;
+ Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
+ if (Update != E && (Offset & (Scale - 1)) == 0) {
+ // Merge the imm12 into the ld/st.
+ MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
+ return true;
+ }
+
+ return false;
+}
+
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
bool EnableNarrowZeroStOpt) {
@@ -2512,6 +2728,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
}
+ // 5) Find a register assigned with a const value that can be combined with
+ // into the load or store. e.g.,
+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
+ // ldr x1, [x0, x8]
+ // ; becomes
+ // add x8, x0, a * (1<<12)
+ // ldr x1, [x8, imm12]
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ int Scale;
+ if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
+ Modified = true;
+ else
+ ++MBBI;
+ }
+
return Modified;
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index d39029163a47a..2181eaaee7db6 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -214,9 +214,8 @@ define void @t17(i64 %a) {
define i8 @LdOffset_i8(ptr %a) {
; CHECK-LABEL: LdOffset_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #56952 // =0xde78
-; CHECK-NEXT: movk w8, #15, lsl #16
-; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
%val = load i8, ptr %arrayidx, align 1
@@ -227,9 +226,8 @@ define i8 @LdOffset_i8(ptr %a) {
define i32 @LdOffset_i8_zext32(ptr %a) {
; CHECK-LABEL: LdOffset_i8_zext32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #56952 // =0xde78
-; CHECK-NEXT: movk w8, #15, lsl #16
-; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
%val = load i8, ptr %arrayidx, align 1
@@ -255,9 +253,8 @@ define i32 @LdOffset_i8_sext32(ptr %a) {
define i64 @LdOffset_i8_zext64(ptr %a) {
; CHECK-LABEL: LdOffset_i8_zext64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #56952 // =0xde78
-; CHECK-NEXT: movk w8, #15, lsl #16
-; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
%val = load i8, ptr %arrayidx, align 1
diff --git a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
index 488f1ffdb52f3..473c74323d939 100755
--- a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
+++ b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
@@ -14,9 +14,8 @@ body: |
; CHECK-LABEL: name: LdOffset
; CHECK: liveins: $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0
- ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
- ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0
+ ; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12
+ ; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704
; CHECK-NEXT: RET undef $lr, implicit $w0
renamable $w8 = MOVZWi 56952, 0
renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
@@ -46,3 +45,23 @@ body: |
renamable $w0 = LDRBBroX killed renamable $x0, renamable $x8, 0, 0
RET undef $lr, implicit $w0
...
+
+# Negative test: No MOVZWi used for the const offset
+---
+name: LdOffset_missing_MOVZ
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '' }
+body: |
+ bb.0.entry:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: LdOffset_missing_MOVZ
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+ ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0
+ ; CHECK-NEXT: RET undef $lr, implicit $w0
+ renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+ renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0
+ RET undef $lr, implicit $w0
``````````
</details>
https://github.com/llvm/llvm-project/pull/79951
More information about the llvm-commits
mailing list