[llvm] f8c2c4f - [LSR] Account for hardware loop instructions (#147958)

Mon Jul 14 08:48:58 PDT 2025

Author: John Brawn
Date: 2025-07-14T16:48:54+01:00
New Revision: f8c2c4f161a6a98f623a6263a248fb24927eeaec

URL: https://github.com/llvm/llvm-project/commit/f8c2c4f161a6a98f623a6263a248fb24927eeaec
DIFF: https://github.com/llvm/llvm-project/commit/f8c2c4f161a6a98f623a6263a248fb24927eeaec.diff

LOG: [LSR] Account for hardware loop instructions (#147958)

A hardware loop instruction combines a subtract, compare with zero, and
branch. We currently account for the compare and branch being combined
into one in Cost::RateFormula, as part of more general handling for
compare-branch-zero, but don't account for the subtract, leading to
suboptimal decisions in some cases.

Fix this in Cost::RateRegister by detecting when the addrec is such a
subtract and, in that case, zeroing its contribution to AddRecCost.

Added: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll

Modified: 
    llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
    llvm/test/DebugInfo/ARM/hardware-loop-phi-insertion.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 9e8d2f2230acc..32616bfe68ca0 100644

--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -521,6 +521,8 @@ struct Formula {
 
   bool hasZeroEnd() const;
 
+  bool countsDownToZero() const;
+
   size_t getNumRegs() const;
   Type *getType() const;
 
@@ -705,6 +707,16 @@ bool Formula::hasZeroEnd() const {
   return true;
 }
 
+bool Formula::countsDownToZero() const {
+  if (!hasZeroEnd())
+    return false;
+  assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
+  const APInt *StepInt;
+  if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
+    return false;
+  return StepInt->isNegative();
+}
+
 /// Return the total number of register operands used by this formula. This does
 /// not include register uses implied by non-constant addrec strides.
 size_t Formula::getNumRegs() const {
@@ -1227,10 +1239,9 @@ class Cost {
     return C.NumRegs == ~0u;
   }
 
-  void RateFormula(const Formula &F,
-                   SmallPtrSetImpl<const SCEV *> &Regs,
-                   const DenseSet<const SCEV *> &VisitedRegs,
-                   const LSRUse &LU,
+  void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
+                   const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
+                   bool HardwareLoopProfitable,
                    SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
 
   void print(raw_ostream &OS) const;
@@ -1238,9 +1249,11 @@ class Cost {
 
 private:
   void RateRegister(const Formula &F, const SCEV *Reg,
-                    SmallPtrSetImpl<const SCEV *> &Regs);
+                    SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
+                    bool HardwareLoopProfitable);
   void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                            SmallPtrSetImpl<const SCEV *> &Regs,
+                           const LSRUse &LU, bool HardwareLoopProfitable,
                            SmallPtrSetImpl<const SCEV *> *LoserRegs);
 };
 
@@ -1383,7 +1396,8 @@ static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
 
 /// Tally up interesting quantities from the given register.
 void Cost::RateRegister(const Formula &F, const SCEV *Reg,
-                        SmallPtrSetImpl<const SCEV *> &Regs) {
+                        SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
+                        bool HardwareLoopProfitable) {
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
     // If this is an addrec for another loop, it should be an invariant
     // with respect to L since L is the innermost loop (at least
@@ -1419,13 +1433,18 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
              SE->isLoopInvariant(Start, L)))
           LoopCost = 0;
     }
+    // If the loop counts down to zero and we'll be using a hardware loop then
+    // the addrec will be combined into the hardware loop instruction.
+    if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
+        HardwareLoopProfitable)
+      LoopCost = 0;
     C.AddRecCost += LoopCost;
 
     // Add the step value register, if it needs one.
     // TODO: The non-affine case isn't precisely modeled here.
     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
       if (!Regs.count(AR->getOperand(1))) {
-        RateRegister(F, AR->getOperand(1), Regs);
+        RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
         if (isLoser())
           return;
       }
@@ -1448,22 +1467,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
 /// one of those regs an instant loser.
 void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                                SmallPtrSetImpl<const SCEV *> &Regs,
+                               const LSRUse &LU, bool HardwareLoopProfitable,
                                SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   if (LoserRegs && LoserRegs->count(Reg)) {
     Lose();
     return;
   }
   if (Regs.insert(Reg).second) {
-    RateRegister(F, Reg, Regs);
+    RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
     if (LoserRegs && isLoser())
       LoserRegs->insert(Reg);
   }
 }
 
-void Cost::RateFormula(const Formula &F,
-                       SmallPtrSetImpl<const SCEV *> &Regs,
+void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
                        const DenseSet<const SCEV *> &VisitedRegs,
-                       const LSRUse &LU,
+                       const LSRUse &LU, bool HardwareLoopProfitable,
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   if (isLoser())
     return;
@@ -1477,7 +1496,8 @@ void Cost::RateFormula(const Formula &F,
       Lose();
       return;
     }
-    RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
+    RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
+                        LoserRegs);
     if (isLoser())
       return;
   }
@@ -1486,7 +1506,8 @@ void Cost::RateFormula(const Formula &F,
       Lose();
       return;
     }
-    RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
+    RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
+                        LoserRegs);
     if (isLoser())
       return;
   }
@@ -2112,6 +2133,7 @@ class LSRInstance {
   TTI::AddressingModeKind AMK;
   mutable SCEVExpander Rewriter;
   bool Changed = false;
+  bool HardwareLoopProfitable = false;
 
   /// This is the insert position that the current loop's induction variable
   /// increment should be placed. In simple loops, this is the latch block's
@@ -3592,7 +3614,8 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
     if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
       Formula F;
       F.initialMatch(S, L, SE);
-      BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
+      BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
+                               HardwareLoopProfitable);
       VisitedLSRUse.insert(LUIdx);
     }
 
@@ -4730,7 +4753,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
       // the corresponding bad register from the Regs set.
       Cost CostF(L, SE, TTI, AMK);
       Regs.clear();
-      CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
+      CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
+                        &LoserRegs);
       if (CostF.isLoser()) {
         // During initial formula generation, undesirable formulae are generated
         // by uses within other loops that have some non-trivial address mode or
@@ -4763,7 +4787,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
 
         Cost CostBest(L, SE, TTI, AMK);
         Regs.clear();
-        CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
+        CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
+                             HardwareLoopProfitable);
         if (CostF.isLess(CostBest))
           std::swap(F, Best);
         LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
@@ -5021,9 +5046,9 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
       Cost CostFA(L, SE, TTI, AMK);
       Cost CostFB(L, SE, TTI, AMK);
       Regs.clear();
-      CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
+      CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
       Regs.clear();
-      CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
+      CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
       return CostFA.isLess(CostFB);
     };
 
@@ -5428,7 +5453,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
     // the current best, prune the search at that point.
     NewCost = CurCost;
     NewRegs = CurRegs;
-    NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
+    NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
     if (NewCost.isLess(SolutionCost)) {
       Workspace.push_back(&F);
       if (Workspace.size() != Uses.size()) {
@@ -6133,6 +6158,12 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
              L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
              dbgs() << ":\n");
 
+  // Check if we expect this loop to use a hardware loop instruction, which will
+  // be used when calculating the costs of formulas.
+  HardwareLoopInfo HWLoopInfo(L);
+  HardwareLoopProfitable =
+      TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
+
   // Configure SCEVExpander already now, so the correct mode is used for
   // isSafeToExpand() checks.
 #if LLVM_ENABLE_ABI_BREAKING_CHECKS

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll
new file mode 100644
index 0000000000000..037b272f60ec7
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll
@@ -0,0 +1,300 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob --verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOMVE
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob,+mve --verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVE
+
+; Check that loop strength reduction understands that it can fold a sub into an
+; le instruction and reduces the cost appropriately, causing it to do this no
+; matter the preferred addressing mode.
+
+define void @test(ptr %dst, i32 %n) {
+; CHECK-NOMVE-LABEL: test:
+; CHECK-NOMVE:       @ %bb.0: @ %entry
+; CHECK-NOMVE-NEXT:    push {r7, lr}
+; CHECK-NOMVE-NEXT:    add.w r0, r0, r1, lsl #1
+; CHECK-NOMVE-NEXT:    movs r2, #0
+; CHECK-NOMVE-NEXT:    sub.w r12, r0, #2
+; CHECK-NOMVE-NEXT:    movs r3, #0
+; CHECK-NOMVE-NEXT:  .LBB0_1: @ %outer_loop
+; CHECK-NOMVE-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NOMVE-NEXT:    @ Child Loop BB0_2 Depth 2
+; CHECK-NOMVE-NEXT:    dls lr, r1
+; CHECK-NOMVE-NEXT:    mov r0, r12
+; CHECK-NOMVE-NEXT:  .LBB0_2: @ %inner_loop
+; CHECK-NOMVE-NEXT:    @ Parent Loop BB0_1 Depth=1
+; CHECK-NOMVE-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NOMVE-NEXT:    strh r2, [r0, #2]!
+; CHECK-NOMVE-NEXT:    le lr, .LBB0_2
+; CHECK-NOMVE-NEXT:  @ %bb.3: @ %outer_loop_end
+; CHECK-NOMVE-NEXT:    @ in Loop: Header=BB0_1 Depth=1
+; CHECK-NOMVE-NEXT:    adds r3, #1
+; CHECK-NOMVE-NEXT:    cmp r3, r1
+; CHECK-NOMVE-NEXT:    it eq
+; CHECK-NOMVE-NEXT:    popeq {r7, pc}
+; CHECK-NOMVE-NEXT:    b .LBB0_1
+;
+; CHECK-MVE-LABEL: test:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    push {r7, lr}
+; CHECK-MVE-NEXT:    add.w r12, r0, r1, lsl #1
+; CHECK-MVE-NEXT:    movs r2, #0
+; CHECK-MVE-NEXT:    movs r3, #0
+; CHECK-MVE-NEXT:  .LBB0_1: @ %outer_loop
+; CHECK-MVE-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-MVE-NEXT:    @ Child Loop BB0_2 Depth 2
+; CHECK-MVE-NEXT:    dls lr, r1
+; CHECK-MVE-NEXT:    mov r0, r12
+; CHECK-MVE-NEXT:  .LBB0_2: @ %inner_loop
+; CHECK-MVE-NEXT:    @ Parent Loop BB0_1 Depth=1
+; CHECK-MVE-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-MVE-NEXT:    strh r2, [r0], #2
+; CHECK-MVE-NEXT:    le lr, .LBB0_2
+; CHECK-MVE-NEXT:  @ %bb.3: @ %outer_loop_end
+; CHECK-MVE-NEXT:    @ in Loop: Header=BB0_1 Depth=1
+; CHECK-MVE-NEXT:    adds r3, #1
+; CHECK-MVE-NEXT:    cmp r3, r1
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    popeq {r7, pc}
+; CHECK-MVE-NEXT:    b .LBB0_1
+entry:
+  br label %outer_loop
+
+outer_loop:
+  %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+  br label %inner_loop
+
+inner_loop:
+  %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+  %add = add i32 %idx_inner, %n
+  %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+  store i16 0, ptr %gep, align 2
+  %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+  %cond_inner = icmp eq i32 %idx_inner.inc, %n
+  br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+  %idx_outer.inc = add nuw i32 %idx_outer, 1
+  %cond_outer = icmp eq i32 %idx_outer.inc, %n
+  br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+  ret void
+}
+
+define void @test_optsize(ptr %dst, i32 %n) optsize {
+; CHECK-LABEL: test_optsize:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    add.w r12, r0, r1, lsl #1
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    movs r3, #0
+; CHECK-NEXT:  .LBB1_1: @ %outer_loop
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB1_2 Depth 2
+; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    mov r0, r12
+; CHECK-NEXT:  .LBB1_2: @ %inner_loop
+; CHECK-NEXT:    @ Parent Loop BB1_1 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    strh r2, [r0], #2
+; CHECK-NEXT:    le lr, .LBB1_2
+; CHECK-NEXT:  @ %bb.3: @ %outer_loop_end
+; CHECK-NEXT:    @ in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    adds r3, #1
+; CHECK-NEXT:    cmp r3, r1
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    popeq {r7, pc}
+; CHECK-NEXT:    b .LBB1_1
+entry:
+  br label %outer_loop
+
+outer_loop:
+  %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+  br label %inner_loop
+
+inner_loop:
+  %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+  %add = add i32 %idx_inner, %n
+  %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+  store i16 0, ptr %gep, align 2
+  %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+  %cond_inner = icmp eq i32 %idx_inner.inc, %n
+  br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+  %idx_outer.inc = add nuw i32 %idx_outer, 1
+  %cond_outer = icmp eq i32 %idx_outer.inc, %n
+  br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+  ret void
+}
+
+; Check that when we can't use LE we don't discount the cost of a sub
+; instruction, so we only get it when postincrement is the preferred addressing
+; mode (i.e. when we have mve).
+
+declare void @otherfn()
+
+define void @test_no_le(ptr %dst, i32 %n) {
+; CHECK-NOMVE-LABEL: test_no_le:
+; CHECK-NOMVE:       @ %bb.0: @ %entry
+; CHECK-NOMVE-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NOMVE-NEXT:    add.w r5, r0, r1, lsl #1
+; CHECK-NOMVE-NEXT:    mov r4, r1
+; CHECK-NOMVE-NEXT:    movs r6, #0
+; CHECK-NOMVE-NEXT:    mov.w r8, #0
+; CHECK-NOMVE-NEXT:  .LBB2_1: @ %outer_loop
+; CHECK-NOMVE-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NOMVE-NEXT:    @ Child Loop BB2_2 Depth 2
+; CHECK-NOMVE-NEXT:    movs r7, #0
+; CHECK-NOMVE-NEXT:  .LBB2_2: @ %inner_loop
+; CHECK-NOMVE-NEXT:    @ Parent Loop BB2_1 Depth=1
+; CHECK-NOMVE-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NOMVE-NEXT:    bl otherfn
+; CHECK-NOMVE-NEXT:    strh.w r6, [r5, r7, lsl #1]
+; CHECK-NOMVE-NEXT:    adds r7, #1
+; CHECK-NOMVE-NEXT:    cmp r4, r7
+; CHECK-NOMVE-NEXT:    bne .LBB2_2
+; CHECK-NOMVE-NEXT:  @ %bb.3: @ %outer_loop_end
+; CHECK-NOMVE-NEXT:    @ in Loop: Header=BB2_1 Depth=1
+; CHECK-NOMVE-NEXT:    add.w r8, r8, #1
+; CHECK-NOMVE-NEXT:    cmp r8, r4
+; CHECK-NOMVE-NEXT:    bne .LBB2_1
+; CHECK-NOMVE-NEXT:  @ %bb.4: @ %exit
+; CHECK-NOMVE-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+;
+; CHECK-MVE-LABEL: test_no_le:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-MVE-NEXT:    sub sp, #4
+; CHECK-MVE-NEXT:    add.w r8, r0, r1, lsl #1
+; CHECK-MVE-NEXT:    mov r9, r1
+; CHECK-MVE-NEXT:    movs r6, #0
+; CHECK-MVE-NEXT:    movs r7, #0
+; CHECK-MVE-NEXT:  .LBB2_1: @ %outer_loop
+; CHECK-MVE-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-MVE-NEXT:    @ Child Loop BB2_2 Depth 2
+; CHECK-MVE-NEXT:    mov r5, r8
+; CHECK-MVE-NEXT:    mov r4, r9
+; CHECK-MVE-NEXT:  .LBB2_2: @ %inner_loop
+; CHECK-MVE-NEXT:    @ Parent Loop BB2_1 Depth=1
+; CHECK-MVE-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-MVE-NEXT:    bl otherfn
+; CHECK-MVE-NEXT:    strh r6, [r5], #2
+; CHECK-MVE-NEXT:    subs r4, #1
+; CHECK-MVE-NEXT:    bne .LBB2_2
+; CHECK-MVE-NEXT:  @ %bb.3: @ %outer_loop_end
+; CHECK-MVE-NEXT:    @ in Loop: Header=BB2_1 Depth=1
+; CHECK-MVE-NEXT:    adds r7, #1
+; CHECK-MVE-NEXT:    cmp r7, r9
+; CHECK-MVE-NEXT:    bne .LBB2_1
+; CHECK-MVE-NEXT:  @ %bb.4: @ %exit
+; CHECK-MVE-NEXT:    add sp, #4
+; CHECK-MVE-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+entry:
+  br label %outer_loop
+
+outer_loop:
+  %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+  br label %inner_loop
+
+inner_loop:
+  %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+  call void @otherfn()
+  %add = add i32 %idx_inner, %n
+  %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+  store i16 0, ptr %gep, align 2
+  %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+  %cond_inner = icmp eq i32 %idx_inner.inc, %n
+  br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+  %idx_outer.inc = add nuw i32 %idx_outer, 1
+  %cond_outer = icmp eq i32 %idx_outer.inc, %n
+  br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+  ret void
+}
+
+define void @test_no_le_optsize(ptr %dst, i32 %n) optsize {
+; CHECK-NOMVE-LABEL: test_no_le_optsize:
+; CHECK-NOMVE:       @ %bb.0: @ %entry
+; CHECK-NOMVE-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NOMVE-NEXT:    add.w r5, r0, r1, lsl #1
+; CHECK-NOMVE-NEXT:    mov r4, r1
+; CHECK-NOMVE-NEXT:    movs r6, #0
+; CHECK-NOMVE-NEXT:    mov.w r8, #0
+; CHECK-NOMVE-NEXT:  .LBB3_1: @ %outer_loop
+; CHECK-NOMVE-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NOMVE-NEXT:    @ Child Loop BB3_2 Depth 2
+; CHECK-NOMVE-NEXT:    movs r7, #0
+; CHECK-NOMVE-NEXT:  .LBB3_2: @ %inner_loop
+; CHECK-NOMVE-NEXT:    @ Parent Loop BB3_1 Depth=1
+; CHECK-NOMVE-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NOMVE-NEXT:    bl otherfn
+; CHECK-NOMVE-NEXT:    strh.w r6, [r5, r7, lsl #1]
+; CHECK-NOMVE-NEXT:    adds r7, #1
+; CHECK-NOMVE-NEXT:    cmp r4, r7
+; CHECK-NOMVE-NEXT:    bne .LBB3_2
+; CHECK-NOMVE-NEXT:  @ %bb.3: @ %outer_loop_end
+; CHECK-NOMVE-NEXT:    @ in Loop: Header=BB3_1 Depth=1
+; CHECK-NOMVE-NEXT:    add.w r8, r8, #1
+; CHECK-NOMVE-NEXT:    cmp r8, r4
+; CHECK-NOMVE-NEXT:    bne .LBB3_1
+; CHECK-NOMVE-NEXT:  @ %bb.4: @ %exit
+; CHECK-NOMVE-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+;
+; CHECK-MVE-LABEL: test_no_le_optsize:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-MVE-NEXT:    sub sp, #4
+; CHECK-MVE-NEXT:    add.w r8, r0, r1, lsl #1
+; CHECK-MVE-NEXT:    mov r9, r1
+; CHECK-MVE-NEXT:    movs r6, #0
+; CHECK-MVE-NEXT:    movs r7, #0
+; CHECK-MVE-NEXT:  .LBB3_1: @ %outer_loop
+; CHECK-MVE-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-MVE-NEXT:    @ Child Loop BB3_2 Depth 2
+; CHECK-MVE-NEXT:    mov r5, r8
+; CHECK-MVE-NEXT:    mov r4, r9
+; CHECK-MVE-NEXT:  .LBB3_2: @ %inner_loop
+; CHECK-MVE-NEXT:    @ Parent Loop BB3_1 Depth=1
+; CHECK-MVE-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-MVE-NEXT:    bl otherfn
+; CHECK-MVE-NEXT:    strh r6, [r5], #2
+; CHECK-MVE-NEXT:    subs r4, #1
+; CHECK-MVE-NEXT:    bne .LBB3_2
+; CHECK-MVE-NEXT:  @ %bb.3: @ %outer_loop_end
+; CHECK-MVE-NEXT:    @ in Loop: Header=BB3_1 Depth=1
+; CHECK-MVE-NEXT:    adds r7, #1
+; CHECK-MVE-NEXT:    cmp r7, r9
+; CHECK-MVE-NEXT:    bne .LBB3_1
+; CHECK-MVE-NEXT:  @ %bb.4: @ %exit
+; CHECK-MVE-NEXT:    add sp, #4
+; CHECK-MVE-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+entry:
+  br label %outer_loop
+
+outer_loop:
+  %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+  br label %inner_loop
+
+inner_loop:
+  %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+  call void @otherfn()
+  %add = add i32 %idx_inner, %n
+  %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+  store i16 0, ptr %gep, align 2
+  %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+  %cond_inner = icmp eq i32 %idx_inner.inc, %n
+  br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+  %idx_outer.inc = add nuw i32 %idx_outer, 1
+  %cond_outer = icmp eq i32 %idx_outer.inc, %n
+  br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+  ret void
+}

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
index 1f3a43923db61..c6158cb611a70 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
@@ -8,48 +8,53 @@ define arm_aapcs_vfpcc void @test(ptr noalias nocapture readonly %off, ptr noali
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
 ; CHECK-NEXT:  .LBB0_1: @ %for.cond1.preheader.us.preheader
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    mov r8, r3
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    sub.w r9, r1, #2
+; CHECK-NEXT:    sub.w r8, r0, #2
+; CHECK-NEXT:    subs r5, r2, #2
+; CHECK-NEXT:    mov r10, r3
 ; CHECK-NEXT:    lsl.w r12, r3, #1
-; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    mov r4, r1
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    mov r4, r9
 ; CHECK-NEXT:  .LBB0_2: @ %for.cond1.preheader.us
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB0_5 Depth 2
-; CHECK-NEXT:    dls lr, r8
-; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    dls lr, r10
+; CHECK-NEXT:    mov r6, r8
+; CHECK-NEXT:    mov r7, r9
+; CHECK-NEXT:    mov r2, r4
 ; CHECK-NEXT:  .LBB0_3: @ %for.body4.us
 ; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh.w r5, [r0, r6, lsl #1]
-; CHECK-NEXT:    ldrh.w r7, [r1, r6, lsl #1]
-; CHECK-NEXT:    add r5, r7
-; CHECK-NEXT:    strh.w r5, [r4, r6, lsl #1]
-; CHECK-NEXT:    adds r6, #1
+; CHECK-NEXT:    ldrh r1, [r6, #2]!
+; CHECK-NEXT:    ldrh r3, [r7, #2]!
+; CHECK-NEXT:    add r1, r3
+; CHECK-NEXT:    strh r1, [r2, #2]!
 ; CHECK-NEXT:    le lr, .LBB0_3
 ; CHECK-NEXT:  @ %bb.4: @ %for.body15.us.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    dls lr, r8
-; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    dls lr, r10
+; CHECK-NEXT:    mov r6, r8
+; CHECK-NEXT:    mov r7, r9
+; CHECK-NEXT:    mov r2, r5
 ; CHECK-NEXT:  .LBB0_5: @ %for.body15.us
 ; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh.w r7, [r0, r6, lsl #1]
-; CHECK-NEXT:    ldrh.w r5, [r1, r6, lsl #1]
-; CHECK-NEXT:    add r5, r7
-; CHECK-NEXT:    strh.w r5, [r2, r6, lsl #1]
-; CHECK-NEXT:    adds r6, #1
+; CHECK-NEXT:    ldrh r1, [r6, #2]!
+; CHECK-NEXT:    ldrh r3, [r7, #2]!
+; CHECK-NEXT:    add r1, r3
+; CHECK-NEXT:    strh r1, [r2, #2]!
 ; CHECK-NEXT:    le lr, .LBB0_5
 ; CHECK-NEXT:  @ %bb.6: @ %for.cond.cleanup14.us
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    adds r3, #1
-; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    adds r0, #1
+; CHECK-NEXT:    add r5, r12
 ; CHECK-NEXT:    add r4, r12
-; CHECK-NEXT:    cmp r3, r8
+; CHECK-NEXT:    cmp r0, r10
 ; CHECK-NEXT:    bne .LBB0_2
 ; CHECK-NEXT:  @ %bb.7:
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    bx lr
 entry:
   %cmp252 = icmp sgt i32 %n, 0

diff  --git a/llvm/test/DebugInfo/ARM/hardware-loop-phi-insertion.ll b/llvm/test/DebugInfo/ARM/hardware-loop-phi-insertion.ll
index f384b989a50d6..6af012a9c9eb4 100644
--- a/llvm/test/DebugInfo/ARM/hardware-loop-phi-insertion.ll
+++ b/llvm/test/DebugInfo/ARM/hardware-loop-phi-insertion.ll
@@ -4,7 +4,7 @@
 ;; records when they appear immediately after the last existing phi node.
 
 ; CHECK-LABEL: for.body:
-; CHECK-NEXT: = phi i32
+; CHECK-NEXT: = phi ptr
 ; CHECK-NEXT: = phi i32
 ; CHECK-NEXT: #dbg_value