[llvm] [LSR] Account for hardware loop instructions (PR #147958)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 10 06:18:08 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: John Brawn (john-brawn-arm)
<details>
<summary>Changes</summary>
A hardware loop instruction combines a subtract, a compare with zero, and a branch. Cost::RateFormula already accounts for the compare and branch being combined into one instruction, as part of its more general handling of compare-and-branch-on-zero, but nothing accounts for the subtract, which leads to suboptimal decisions in some cases.
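Fix this in Cost::RateRegister by detecting when an addrec is such a subtract (the formula of an ICmpZero use counts down to zero and a hardware loop is expected to be profitable) and not charging it to AddRecCost in that case.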
---
Patch is 22.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147958.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp (+50-19)
- (added) llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll (+300)
- (modified) llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll (+27-22)
``````````diff
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 845afa6d4228b..c9a3e477ad86c 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -521,6 +521,8 @@ struct Formula {
bool hasZeroEnd() const;
+ bool countsDownToZero() const;
+
size_t getNumRegs() const;
Type *getType() const;
@@ -705,6 +707,16 @@ bool Formula::hasZeroEnd() const {
return true;
}
+bool Formula::countsDownToZero() const {
+ if (!hasZeroEnd())
+ return false;
+ assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
+ const APInt *StepInt;
+ if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
+ return false;
+ return StepInt->isNegative();
+}
+
/// Return the total number of register operands used by this formula. This does
/// not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
@@ -1227,10 +1239,9 @@ class Cost {
return C.NumRegs == ~0u;
}
- void RateFormula(const Formula &F,
- SmallPtrSetImpl<const SCEV *> &Regs,
- const DenseSet<const SCEV *> &VisitedRegs,
- const LSRUse &LU,
+ void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
+ const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
+ bool HardwareLoopProfitable,
SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
void print(raw_ostream &OS) const;
@@ -1238,9 +1249,11 @@ class Cost {
private:
void RateRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs);
+ SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
+ bool HardwareLoopProfitable);
void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
+ const LSRUse &LU, bool HardwareLoopProfitable,
SmallPtrSetImpl<const SCEV *> *LoserRegs);
};
@@ -1383,7 +1396,8 @@ static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
/// Tally up interesting quantities from the given register.
void Cost::RateRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs) {
+ SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
+ bool HardwareLoopProfitable) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
// If this is an addrec for another loop, it should be an invariant
// with respect to L since L is the innermost loop (at least
@@ -1419,13 +1433,18 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
SE->isLoopInvariant(Start, L)))
LoopCost = 0;
}
+ // If the loop counts down to zero and we'll be using a hardware loop then
+ // the addrec will be combined into the hardware loop instruction.
+ if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
+ HardwareLoopProfitable)
+ LoopCost = 0;
C.AddRecCost += LoopCost;
// Add the step value register, if it needs one.
// TODO: The non-affine case isn't precisely modeled here.
if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
if (!Regs.count(AR->getOperand(1))) {
- RateRegister(F, AR->getOperand(1), Regs);
+ RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
if (isLoser())
return;
}
@@ -1448,22 +1467,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
/// one of those regs an instant loser.
void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
+ const LSRUse &LU, bool HardwareLoopProfitable,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
if (LoserRegs && LoserRegs->count(Reg)) {
Lose();
return;
}
if (Regs.insert(Reg).second) {
- RateRegister(F, Reg, Regs);
+ RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
if (LoserRegs && isLoser())
LoserRegs->insert(Reg);
}
}
-void Cost::RateFormula(const Formula &F,
- SmallPtrSetImpl<const SCEV *> &Regs,
+void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
const DenseSet<const SCEV *> &VisitedRegs,
- const LSRUse &LU,
+ const LSRUse &LU, bool HardwareLoopProfitable,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
if (isLoser())
return;
@@ -1477,7 +1496,8 @@ void Cost::RateFormula(const Formula &F,
Lose();
return;
}
- RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
+ RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
+ LoserRegs);
if (isLoser())
return;
}
@@ -1486,7 +1506,8 @@ void Cost::RateFormula(const Formula &F,
Lose();
return;
}
- RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
+ RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
+ LoserRegs);
if (isLoser())
return;
}
@@ -2112,6 +2133,7 @@ class LSRInstance {
TTI::AddressingModeKind AMK;
mutable SCEVExpander Rewriter;
bool Changed = false;
+ bool HardwareLoopProfitable = false;
/// This is the insert position that the current loop's induction variable
/// increment should be placed. In simple loops, this is the latch block's
@@ -3590,7 +3612,8 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
Formula F;
F.initialMatch(S, L, SE);
- BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
+ BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
+ HardwareLoopProfitable);
VisitedLSRUse.insert(LUIdx);
}
@@ -4728,7 +4751,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
// the corresponding bad register from the Regs set.
Cost CostF(L, SE, TTI, AMK);
Regs.clear();
- CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
+ CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
+ &LoserRegs);
if (CostF.isLoser()) {
// During initial formula generation, undesirable formulae are generated
// by uses within other loops that have some non-trivial address mode or
@@ -4761,7 +4785,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
Cost CostBest(L, SE, TTI, AMK);
Regs.clear();
- CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
+ CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
+ HardwareLoopProfitable);
if (CostF.isLess(CostBest))
std::swap(F, Best);
LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
@@ -5019,9 +5044,9 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
Cost CostFA(L, SE, TTI, AMK);
Cost CostFB(L, SE, TTI, AMK);
Regs.clear();
- CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
+ CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
Regs.clear();
- CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
+ CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
return CostFA.isLess(CostFB);
};
@@ -5426,7 +5451,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
// the current best, prune the search at that point.
NewCost = CurCost;
NewRegs = CurRegs;
- NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
+ NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
if (NewCost.isLess(SolutionCost)) {
Workspace.push_back(&F);
if (Workspace.size() != Uses.size()) {
@@ -6131,6 +6156,12 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
dbgs() << ":\n");
+ // Check if we expect this loop to use a hardware loop instruction, which will
+ // be used when calculating the costs of formulas.
+ HardwareLoopInfo HWLoopInfo(L);
+ HardwareLoopProfitable =
+ TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
+
// Configure SCEVExpander already now, so the correct mode is used for
// isSafeToExpand() checks.
#if LLVM_ENABLE_ABI_BREAKING_CHECKS
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll
new file mode 100644
index 0000000000000..037b272f60ec7
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll
@@ -0,0 +1,300 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob --verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOMVE
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob,+mve --verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVE
+
+; Check that loop strength reduction understands that it can fold a sub into an
+; le instruction and reduces the cost appropriately, causing it to do this no
+; matter the preferred addressing mode.
+
+define void @test(ptr %dst, i32 %n) {
+; CHECK-NOMVE-LABEL: test:
+; CHECK-NOMVE: @ %bb.0: @ %entry
+; CHECK-NOMVE-NEXT: push {r7, lr}
+; CHECK-NOMVE-NEXT: add.w r0, r0, r1, lsl #1
+; CHECK-NOMVE-NEXT: movs r2, #0
+; CHECK-NOMVE-NEXT: sub.w r12, r0, #2
+; CHECK-NOMVE-NEXT: movs r3, #0
+; CHECK-NOMVE-NEXT: .LBB0_1: @ %outer_loop
+; CHECK-NOMVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NOMVE-NEXT: @ Child Loop BB0_2 Depth 2
+; CHECK-NOMVE-NEXT: dls lr, r1
+; CHECK-NOMVE-NEXT: mov r0, r12
+; CHECK-NOMVE-NEXT: .LBB0_2: @ %inner_loop
+; CHECK-NOMVE-NEXT: @ Parent Loop BB0_1 Depth=1
+; CHECK-NOMVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NOMVE-NEXT: strh r2, [r0, #2]!
+; CHECK-NOMVE-NEXT: le lr, .LBB0_2
+; CHECK-NOMVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-NOMVE-NEXT: @ in Loop: Header=BB0_1 Depth=1
+; CHECK-NOMVE-NEXT: adds r3, #1
+; CHECK-NOMVE-NEXT: cmp r3, r1
+; CHECK-NOMVE-NEXT: it eq
+; CHECK-NOMVE-NEXT: popeq {r7, pc}
+; CHECK-NOMVE-NEXT: b .LBB0_1
+;
+; CHECK-MVE-LABEL: test:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: push {r7, lr}
+; CHECK-MVE-NEXT: add.w r12, r0, r1, lsl #1
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: movs r3, #0
+; CHECK-MVE-NEXT: .LBB0_1: @ %outer_loop
+; CHECK-MVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-MVE-NEXT: @ Child Loop BB0_2 Depth 2
+; CHECK-MVE-NEXT: dls lr, r1
+; CHECK-MVE-NEXT: mov r0, r12
+; CHECK-MVE-NEXT: .LBB0_2: @ %inner_loop
+; CHECK-MVE-NEXT: @ Parent Loop BB0_1 Depth=1
+; CHECK-MVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-MVE-NEXT: strh r2, [r0], #2
+; CHECK-MVE-NEXT: le lr, .LBB0_2
+; CHECK-MVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-MVE-NEXT: @ in Loop: Header=BB0_1 Depth=1
+; CHECK-MVE-NEXT: adds r3, #1
+; CHECK-MVE-NEXT: cmp r3, r1
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: popeq {r7, pc}
+; CHECK-MVE-NEXT: b .LBB0_1
+entry:
+ br label %outer_loop
+
+outer_loop:
+ %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+ br label %inner_loop
+
+inner_loop:
+ %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+ %add = add i32 %idx_inner, %n
+ %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+ store i16 0, ptr %gep, align 2
+ %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+ %cond_inner = icmp eq i32 %idx_inner.inc, %n
+ br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+ %idx_outer.inc = add nuw i32 %idx_outer, 1
+ %cond_outer = icmp eq i32 %idx_outer.inc, %n
+ br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+ ret void
+}
+
+define void @test_optsize(ptr %dst, i32 %n) optsize {
+; CHECK-LABEL: test_optsize:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: add.w r12, r0, r1, lsl #1
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: .LBB1_1: @ %outer_loop
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB1_2 Depth 2
+; CHECK-NEXT: dls lr, r1
+; CHECK-NEXT: mov r0, r12
+; CHECK-NEXT: .LBB1_2: @ %inner_loop
+; CHECK-NEXT: @ Parent Loop BB1_1 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: strh r2, [r0], #2
+; CHECK-NEXT: le lr, .LBB1_2
+; CHECK-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-NEXT: @ in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: adds r3, #1
+; CHECK-NEXT: cmp r3, r1
+; CHECK-NEXT: it eq
+; CHECK-NEXT: popeq {r7, pc}
+; CHECK-NEXT: b .LBB1_1
+entry:
+ br label %outer_loop
+
+outer_loop:
+ %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+ br label %inner_loop
+
+inner_loop:
+ %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+ %add = add i32 %idx_inner, %n
+ %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+ store i16 0, ptr %gep, align 2
+ %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+ %cond_inner = icmp eq i32 %idx_inner.inc, %n
+ br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+ %idx_outer.inc = add nuw i32 %idx_outer, 1
+ %cond_outer = icmp eq i32 %idx_outer.inc, %n
+ br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+ ret void
+}
+
+; Check that when we can't use LE we don't discount the cost of a sub
+; instruction, so we only get it when postincrement is the preferred addressing
+; mode (i.e. when we have mve).
+
+declare void @otherfn()
+
+define void @test_no_le(ptr %dst, i32 %n) {
+; CHECK-NOMVE-LABEL: test_no_le:
+; CHECK-NOMVE: @ %bb.0: @ %entry
+; CHECK-NOMVE-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NOMVE-NEXT: add.w r5, r0, r1, lsl #1
+; CHECK-NOMVE-NEXT: mov r4, r1
+; CHECK-NOMVE-NEXT: movs r6, #0
+; CHECK-NOMVE-NEXT: mov.w r8, #0
+; CHECK-NOMVE-NEXT: .LBB2_1: @ %outer_loop
+; CHECK-NOMVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NOMVE-NEXT: @ Child Loop BB2_2 Depth 2
+; CHECK-NOMVE-NEXT: movs r7, #0
+; CHECK-NOMVE-NEXT: .LBB2_2: @ %inner_loop
+; CHECK-NOMVE-NEXT: @ Parent Loop BB2_1 Depth=1
+; CHECK-NOMVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NOMVE-NEXT: bl otherfn
+; CHECK-NOMVE-NEXT: strh.w r6, [r5, r7, lsl #1]
+; CHECK-NOMVE-NEXT: adds r7, #1
+; CHECK-NOMVE-NEXT: cmp r4, r7
+; CHECK-NOMVE-NEXT: bne .LBB2_2
+; CHECK-NOMVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-NOMVE-NEXT: @ in Loop: Header=BB2_1 Depth=1
+; CHECK-NOMVE-NEXT: add.w r8, r8, #1
+; CHECK-NOMVE-NEXT: cmp r8, r4
+; CHECK-NOMVE-NEXT: bne .LBB2_1
+; CHECK-NOMVE-NEXT: @ %bb.4: @ %exit
+; CHECK-NOMVE-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+;
+; CHECK-MVE-LABEL: test_no_le:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-MVE-NEXT: sub sp, #4
+; CHECK-MVE-NEXT: add.w r8, r0, r1, lsl #1
+; CHECK-MVE-NEXT: mov r9, r1
+; CHECK-MVE-NEXT: movs r6, #0
+; CHECK-MVE-NEXT: movs r7, #0
+; CHECK-MVE-NEXT: .LBB2_1: @ %outer_loop
+; CHECK-MVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-MVE-NEXT: @ Child Loop BB2_2 Depth 2
+; CHECK-MVE-NEXT: mov r5, r8
+; CHECK-MVE-NEXT: mov r4, r9
+; CHECK-MVE-NEXT: .LBB2_2: @ %inner_loop
+; CHECK-MVE-NEXT: @ Parent Loop BB2_1 Depth=1
+; CHECK-MVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-MVE-NEXT: bl otherfn
+; CHECK-MVE-NEXT: strh r6, [r5], #2
+; CHECK-MVE-NEXT: subs r4, #1
+; CHECK-MVE-NEXT: bne .LBB2_2
+; CHECK-MVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-MVE-NEXT: @ in Loop: Header=BB2_1 Depth=1
+; CHECK-MVE-NEXT: adds r7, #1
+; CHECK-MVE-NEXT: cmp r7, r9
+; CHECK-MVE-NEXT: bne .LBB2_1
+; CHECK-MVE-NEXT: @ %bb.4: @ %exit
+; CHECK-MVE-NEXT: add sp, #4
+; CHECK-MVE-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
+entry:
+ br label %outer_loop
+
+outer_loop:
+ %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+ br label %inner_loop
+
+inner_loop:
+ %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+ call void @otherfn()
+ %add = add i32 %idx_inner, %n
+ %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+ store i16 0, ptr %gep, align 2
+ %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+ %cond_inner = icmp eq i32 %idx_inner.inc, %n
+ br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+ %idx_outer.inc = add nuw i32 %idx_outer, 1
+ %cond_outer = icmp eq i32 %idx_outer.inc, %n
+ br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+ ret void
+}
+
+define void @test_no_le_optsize(ptr %dst, i32 %n) optsize {
+; CHECK-NOMVE-LABEL: test_no_le_optsize:
+; CHECK-NOMVE: @ %bb.0: @ %entry
+; CHECK-NOMVE-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NOMVE-NEXT: add.w r5, r0, r1, lsl #1
+; CHECK-NOMVE-NEXT: mov r4, r1
+; CHECK-NOMVE-NEXT: movs r6, #0
+; CHECK-NOMVE-NEXT: mov.w r8, #0
+; CHECK-NOMVE-NEXT: .LBB3_1: @ %outer_loop
+; CHECK-NOMVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NOMVE-NEXT: @ Child Loop BB3_2 Depth 2
+; CHECK-NOMVE-NEXT: movs r7, #0
+; CHECK-NOMVE-NEXT: .LBB3_2: @ %inner_loop
+; CHECK-NOMVE-NEXT: @ Parent Loop BB3_1 Depth=1
+; CHECK-NOMVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NOMVE-NEXT: bl otherfn
+; CHECK-NOMVE-NEXT: strh.w r6, [r5, r7, lsl #1]
+; CHECK-NOMVE-NEXT: adds r7, #1
+; CHECK-NOMVE-NEXT: cmp r4, r7
+; CHECK-NOMVE-NEXT: bne .LBB3_2
+; CHECK-NOMVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-NOMVE-NEXT: @ in Loop: Header=BB3_1 Depth=1
+; CHECK-NOMVE-NEXT: add.w r8, r8, #1
+; CHECK-NOMVE-NEXT: cmp r8, r4
+; CHECK-NOMVE-NEXT: bne .LBB3_1
+; CHECK-NOMVE-NEXT: @ %bb.4: @ %exit
+; CHECK-NOMVE-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+;
+; CHECK-MVE-LABEL: test_no_le_optsize:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-MVE-NEXT: sub sp, #4
+; CHECK-MVE-NEXT: add.w r8, r0, r1, lsl #1
+; CHECK-MVE-NEXT: mov r9, r1
+; CHECK-MVE-NEXT: movs r6, #0
+; CHECK-MVE-NEXT: movs r7, #0
+; CHECK-MVE-NEXT: .LBB3_1: @ %outer_loop
+; CHECK-MVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-MVE-NEXT: @ Child Loop BB3_2 Depth 2
+; CHECK-MVE-NEXT: mov r5, r8
+; CHECK-MVE-NEXT: mov r4, r9
+; CHECK-MVE-NEXT: .LBB3_2: @ %inner_loop
+; CHECK-MVE-NEXT: @ Parent Loop BB3_1 Depth=1
+; CHECK-MVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-MVE-NEXT: bl otherfn
+; CHECK-MVE-NEXT: strh r6, [r5], #2
+; CHECK-MVE-NEXT: subs r4, #1
+; CHECK-MVE-NEXT: bne .LBB3_2
+; CHECK-MVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-MVE-NEXT: @ in Loop: Header=BB3_1 Depth=1
+; CHECK-MVE-NEXT: adds r7, #1
+; CHECK-MVE-NEXT: cmp r7, r9
+; CHECK-MVE-NEXT: bne .LBB3_1
+; CHECK-MVE-NEXT: @ %bb.4: @ %exit
+; CHECK-MVE-NEXT: add sp, #4
+; CHECK-MVE-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
+entry:
+ br label %outer_loop
+
+outer_loop:
+ %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+ br label %inner_loop
+
+inner_loop:
+ %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+ call void @otherfn()
+ %add = add i32 %idx_inner, %n
+ %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+ store i16 0, ptr %gep, align 2
+ %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+ %cond_inner = icmp eq i32 %idx_inner.inc, %n
+ br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+ %idx_outer.inc = add nuw i32 %idx_outer, 1
+ %cond_outer = icmp eq i32 %idx_outer.inc, %n
+ br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
index 1f3a43923db61..c6158cb611a70 100644
--- a/llv...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/147958
More information about the llvm-commits mailing list