[llvm] [AArch64] MI Scheduler: create more LDP/STP pairs (PR #77565)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 10 00:11:25 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Sjoerd Meijer (sjoerdmeijer)
<details>
<summary>Changes</summary>
Target hook `canPairLdStOpc` is missing quite a few opcodes for which
LDPs/STPs can be created. I was hoping that it would not be necessary to
add these missing opcodes here and that the attached motivating test
case would be handled by the LoadStoreOptimiser (especially after
https://github.com/llvm/llvm-project/pull/71908), but it's not. The problem is that after register allocation
some things are a lot harder to do. Consider this for the motivating example:
```
[1] renamable $q1 = LDURQi renamable $x9, -16 :: (load (s128) from %ir.r51, align 8, !tbaa !0)
[2] renamable $q2 = LDURQi renamable $x0, -16 :: (load (s128) from %ir.r53, align 8, !tbaa !4)
[3] renamable $q1 = nnan ninf nsz arcp contract afn reassoc nofpexcept FMLSv2f64 killed renamable $q1(tied-def 0), killed renamable $q2, renamable $q0, implicit $fpcr
[4] STURQi killed renamable $q1, renamable $x9, -16 :: (store (s128) into %ir.r51, align 1, !tbaa !0)
[5] renamable $q1 = LDRQui renamable $x9, 0 :: (load (s128) from %ir.r.G0001_609.0, align 8, !tbaa !0)
```
We can't combine the load in line [5] into the load on [1]: register q1 is
used in between. And we can't combine [1] into [5]: it is aliasing with
the STR on line [4].
That's why I thought that adding this opcode pair here to the MI scheduler was
a good compromise.
---
Full diff: https://github.com/llvm/llvm-project/pull/77565.diff
4 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+3)
- (modified) llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (+55-20)
- (modified) llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll (+41)
- (modified) llvm/test/CodeGen/AArch64/machine-combiner-copy.ll (+4-4)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 1cfbf4737a6f72..42b7a6418032ad 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4214,6 +4214,9 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
switch (FirstOpc) {
default:
return false;
+ case AArch64::LDRQui:
+ case AArch64::LDURQi:
+ return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
case AArch64::LDRWui:
case AArch64::LDURWi:
return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index b435b3ce03e7ee..1551dc4435cea0 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1326,9 +1326,12 @@ static int alignTo(int Num, int PowOf2) {
static bool mayAlias(MachineInstr &MIa,
SmallVectorImpl<MachineInstr *> &MemInsns,
AliasAnalysis *AA) {
- for (MachineInstr *MIb : MemInsns)
- if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false))
+ for (MachineInstr *MIb : MemInsns) {
+ if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) {
+ LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
return true;
+ }
+ }
return false;
}
@@ -1525,7 +1528,7 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
// * collect the registers used and required register classes for RegToRename.
std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
bool IsDef) {
- LLVM_DEBUG(dbgs() << "Checking " << MI);
+ LLVM_DEBUG(dbgs() << "1.Checking " << MI);
// Currently we do not try to rename across frame-setup instructions.
if (MI.getFlag(MachineInstr::FrameSetup)) {
LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
@@ -1612,7 +1615,7 @@ static bool canRenameUntilSecondLoad(
bool Success = std::all_of(
FirstLoad.getIterator(), SecondLoad.getIterator(),
[&](MachineInstr &MI) {
- LLVM_DEBUG(dbgs() << "Checking " << MI);
+ LLVM_DEBUG(dbgs() << "2.Checking " << MI);
// Currently we do not try to rename across frame-setup instructions.
if (MI.getFlag(MachineInstr::FrameSetup)) {
LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
@@ -1757,9 +1760,11 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// Remember any instructions that read/write memory between FirstMI and MI.
SmallVector<MachineInstr *, 4> MemInsns;
+ LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
for (unsigned Count = 0; MBBI != E && Count < Limit;
MBBI = next_nodbg(MBBI, E)) {
MachineInstr &MI = *MBBI;
+ LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());
UsedInBetween.accumulate(MI);
@@ -1859,6 +1864,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
UsedRegUnits, TRI);
MemInsns.push_back(&MI);
+ LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
+ << "keep looking.\n");
continue;
}
// If the alignment requirements of the paired (scaled) instruction
@@ -1868,6 +1875,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
UsedRegUnits, TRI);
MemInsns.push_back(&MI);
+ LLVM_DEBUG(dbgs()
+ << "Offset doesn't fit due to alignment requirements, "
+ << "keep looking.\n");
continue;
}
}
@@ -1884,14 +1894,25 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
Reg, getLdStRegOp(MI).getReg());
- // If the Rt of the second instruction was not modified or used between
- // the two instructions and none of the instructions between the second
- // and first alias with the second, we can combine the second into the
- // first.
- if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
- !(MI.mayLoad() && !SameLoadReg &&
- !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
- !mayAlias(MI, MemInsns, AA)) {
+ // If the Rt of the second instruction (destination register of the
+ // load) was not modified or used between the two instructions and none
+ // of the instructions between the second and first alias with the
+ // second, we can combine the second into the first.
+ bool RtNotModified =
+ ModifiedRegUnits.available(getLdStRegOp(MI).getReg());
+ bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
+ !UsedRegUnits.available(getLdStRegOp(MI).getReg()));
+ bool DontAlias = !mayAlias(MI, MemInsns, AA);
+
+ LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
+ << "Reg '" << getLdStRegOp(MI) << "' not modified: "
+ << (RtNotModified ? "true" : "false") << "\n"
+ << "Reg '" << getLdStRegOp(MI) << "' not used: "
+ << (RtNotUsed ? "true" : "false") << "\n"
+ << "Does not alias with other mem insns: "
+ << (DontAlias ? "true" : "false") << "\n");
+
+ if (RtNotModified && RtNotUsed && DontAlias) {
// For pairs loading into the same reg, try to find a renaming
// opportunity to allow the renaming of Reg between FirstMI and MI
// and combine MI into FirstMI; otherwise bail and keep looking.
@@ -1904,6 +1925,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
UsedRegUnits, TRI);
MemInsns.push_back(&MI);
+ LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
+ << "keep looking.\n");
continue;
}
Flags.setRenameReg(*RenameReg);
@@ -1919,10 +1942,18 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// between the two instructions and none of the instructions between the
// first and the second alias with the first, we can combine the first
// into the second.
- if (!(MayLoad &&
- !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
- !mayAlias(FirstMI, MemInsns, AA)) {
-
+ RtNotModified = !(
+ MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg()));
+ DontAlias = !mayAlias(FirstMI, MemInsns, AA);
+
+ LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
+ << "Reg '" << getLdStRegOp(FirstMI)
+ << "' not modified: "
+ << (RtNotModified ? "true" : "false") << "\n"
+ << "Does not alias with other mem insns: "
+ << (DontAlias ? "true" : "false") << "\n");
+
+ if (RtNotModified && DontAlias) {
if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
Flags.setMergeForward(true);
Flags.clearRenameReg();
@@ -1938,8 +1969,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
MBBIWithRenameReg = MBBI;
}
}
- // Unable to combine these instructions due to interference in between.
- // Keep looking.
+ LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
+ << "interference in between, keep looking.\n");
}
}
@@ -1948,16 +1979,20 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// If the instruction wasn't a matching load or store. Stop searching if we
// encounter a call instruction that might modify memory.
- if (MI.isCall())
+ if (MI.isCall()) {
+ LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
return E;
+ }
// Update modified / uses register units.
LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
// Otherwise, if the base register is modified, we have no match, so
// return early.
- if (!ModifiedRegUnits.available(BaseReg))
+ if (!ModifiedRegUnits.available(BaseReg)) {
+ LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
return E;
+ }
// Update list of instructions that read/write memory.
if (MI.mayLoadOrStore())
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index 4fa34e846b2061..83f86d1c3a7cbf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -113,3 +113,44 @@ define <2 x i64> @ldq_cluster(ptr %p) {
%res = mul nsw <2 x i64> %tmp2, %tmp3
ret <2 x i64> %res
}
+
+; Test LDURQi / LDRQui clustering
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURQi_LDRQui:%bb.1 vector_body
+;
+; CHECK: Cluster ld/st SU(0) - SU(4)
+; CHECK: Cluster ld/st SU(1) - SU(5)
+;
+; CHECK: SU(0): %{{[0-9]+}}:fpr128 = LDURQi
+; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDURQi
+; CHECK: SU(4): %{{[0-9]+}}:fpr128 = LDRQui
+; CHECK: SU(5): %{{[0-9]+}}:fpr128 = LDRQui
+;
+define void @LDURQi_LDRQui(ptr nocapture readonly %arg) {
+entry:
+ br label %vector_body
+vector_body:
+ %phi1 = phi ptr [ null, %entry ], [ %r63, %vector_body ]
+ %phi2 = phi ptr [ %arg, %entry ], [ %r62, %vector_body ]
+ %phi3 = phi i32 [ 0, %entry ], [ %r61, %vector_body ]
+ %r51 = getelementptr i8, ptr %phi1, i64 -16
+ %r52 = load <2 x double>, ptr %r51, align 8
+ %r53 = getelementptr i8, ptr %phi2, i64 -16
+ %r54 = load <2 x double>, ptr %r53, align 8
+ %r55 = fmul fast <2 x double> %r54, <double 3.0, double 4.0>
+ %r56 = fsub fast <2 x double> %r52, %r55
+ store <2 x double> %r56, ptr %r51, align 1
+ %r57 = load <2 x double>, ptr %phi1, align 8
+ %r58 = load <2 x double>, ptr %phi2, align 8
+ %r59 = fmul fast <2 x double> %r58,<double 3.0, double 4.0>
+ %r60 = fsub fast <2 x double> %r57, %r59
+ store <2 x double> %r60, ptr %phi1, align 1
+ %r61 = add i32 %phi3, 4
+ %r62 = getelementptr i8, ptr %phi2, i64 32
+ %r63 = getelementptr i8, ptr %phi1, i64 32
+ %r.not = icmp eq i32 %r61, 0
+ br i1 %r.not, label %exit, label %vector_body
+exit:
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
index 15a484d11b0a1d..4c8e589391c3ac 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
@@ -20,13 +20,13 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
; CHECK-NEXT: mov x12, x9
; CHECK-NEXT: .LBB0_4: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q1, q3, [x11, #-16]
+; CHECK-NEXT: ldp q1, q4, [x10, #-16]
; CHECK-NEXT: subs x12, x12, #16
-; CHECK-NEXT: ldp q2, q4, [x10, #-16]
+; CHECK-NEXT: ldp q2, q3, [x11, #-16]
; CHECK-NEXT: add x11, x11, #32
-; CHECK-NEXT: fmla v2.8h, v1.8h, v0.h[0]
+; CHECK-NEXT: fmla v1.8h, v2.8h, v0.h[0]
; CHECK-NEXT: fmla v4.8h, v3.8h, v0.h[0]
-; CHECK-NEXT: stp q2, q4, [x10, #-16]
+; CHECK-NEXT: stp q1, q4, [x10, #-16]
; CHECK-NEXT: add x10, x10, #32
; CHECK-NEXT: b.ne .LBB0_4
; CHECK-NEXT: // %bb.5: // %middle.block
``````````
</details>
https://github.com/llvm/llvm-project/pull/77565
More information about the llvm-commits
mailing list