[llvm] 58f3201 - [ARM] Updates to arm-block-placement pass

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 12 06:46:34 PDT 2021


Author: Malhar Jajoo
Date: 2021-04-12T14:46:23+01:00
New Revision: 58f3201a20f7c51393ef4509b69515cb9c4b32bf

URL: https://github.com/llvm/llvm-project/commit/58f3201a20f7c51393ef4509b69515cb9c4b32bf
DIFF: https://github.com/llvm/llvm-project/commit/58f3201a20f7c51393ef4509b69515cb9c4b32bf.diff

LOG: [ARM] Updates to arm-block-placement pass

The patch makes two updates to the arm-block-placement pass:
- Handle arbitrarily nested loops
- Extends the search (for t2WhileLoopStartLR) to the predecessor of the
  preHeader.

Differential Revision: https://reviews.llvm.org/D99649

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMBlockPlacement.cpp
    llvm/test/CodeGen/Thumb2/block-placement.mir
    llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
    llvm/test/CodeGen/Thumb2/mve-float32regloops.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
index 155a6cb18d85d..57914b3b3d618 100644
--- a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
+++ b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
@@ -38,6 +38,8 @@ class ARMBlockPlacement : public MachineFunctionPass {
   bool runOnMachineFunction(MachineFunction &MF) override;
   void moveBasicBlock(MachineBasicBlock *BB, MachineBasicBlock *After);
   bool blockIsBefore(MachineBasicBlock *BB, MachineBasicBlock *Other);
+  bool fixBackwardsWLS(MachineLoop *ML);
+  bool processPostOrderLoops(MachineLoop *ML);
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
@@ -57,9 +59,135 @@ char ARMBlockPlacement::ID = 0;
 INITIALIZE_PASS(ARMBlockPlacement, DEBUG_TYPE, "ARM block placement", false,
                 false)
 
+static MachineInstr *findWLSInBlock(MachineBasicBlock *MBB) {
+  for (auto &Terminator : MBB->terminators()) {
+    if (Terminator.getOpcode() == ARM::t2WhileLoopStartLR)
+      return &Terminator;
+  }
+  return nullptr;
+}
+
+/// Find t2WhileLoopStartLR in the loop predecessor BB or otherwise in its only
+/// predecessor. If found, returns the WLS instruction, otherwise nullptr.
+static MachineInstr *findWLS(MachineLoop *ML) {
+  MachineBasicBlock *Predecessor = ML->getLoopPredecessor();
+  if (!Predecessor)
+    return nullptr;
+  MachineInstr *WlsInstr = findWLSInBlock(Predecessor);
+  if (WlsInstr)
+    return WlsInstr;
+  if (Predecessor->pred_size() == 1)
+    return findWLSInBlock(*Predecessor->pred_begin());
+  return nullptr;
+}
+
+/// Checks if loop has a backwards branching WLS, and if possible, fixes it.
+/// This requires checking the preheader (or its predecessor) for a WLS and
+/// whether its target is before it.
+/// If moving the target block wouldn't produce another backwards WLS or a new
+/// forwards LE branch, then move the target block after the preheader (or its
+/// predecessor).
+bool ARMBlockPlacement::fixBackwardsWLS(MachineLoop *ML) {
+  MachineInstr *WlsInstr = findWLS(ML);
+  if (!WlsInstr)
+    return false;
+
+  MachineBasicBlock *Predecessor = WlsInstr->getParent();
+  MachineBasicBlock *LoopExit = WlsInstr->getOperand(2).getMBB();
+  // We don't want to move the function's entry block.
+  if (!LoopExit->getPrevNode())
+    return false;
+  if (blockIsBefore(Predecessor, LoopExit))
+    return false;
+  LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from "
+                    << Predecessor->getFullName() << " to "
+                    << LoopExit->getFullName() << "\n");
+
+  // Make sure that moving the target block doesn't cause any of its WLSs
+  // that were previously not backwards to become backwards
+  bool CanMove = true;
+  MachineInstr *WlsInLoopExit = findWLSInBlock(LoopExit);
+  if (WlsInLoopExit) {
+    // An example loop structure where the LoopExit can't be moved, since
+    // bb1's WLS will become backwards once it's moved after bb3
+    // bb1:          - LoopExit
+    //      WLS bb2
+    // bb2:          - LoopExit2
+    //      ...
+    // bb3:          - Predecessor
+    //      WLS bb1
+    // bb4:          - Header
+    MachineBasicBlock *LoopExit2 = WlsInLoopExit->getOperand(2).getMBB();
+    // If the WLS from LoopExit to LoopExit2 is already backwards then
+    // moving LoopExit won't affect it, so it can be moved. If LoopExit2 is
+    // after the Predecessor then moving will keep it as a forward branch, so it
+    // can be moved. If LoopExit2 is between the Predecessor and LoopExit then
+    // moving LoopExit will make it a backwards branch, so it can't be moved
+    // since we'd fix one and introduce one backwards branch.
+    // TODO: Analyse the blocks to make a decision if it would be worth
+    // moving LoopExit even if LoopExit2 is between the Predecessor and
+    // LoopExit.
+    if (!blockIsBefore(LoopExit2, LoopExit) &&
+        (LoopExit2 == Predecessor || blockIsBefore(LoopExit2, Predecessor))) {
+      LLVM_DEBUG(dbgs() << DEBUG_PREFIX
+                        << "Can't move the target block as it would "
+                           "introduce a new backwards WLS branch\n");
+      CanMove = false;
+    }
+  }
+
+  if (CanMove) {
+    // Make sure no LEs become forwards.
+    // An example loop structure where the LoopExit can't be moved, since
+    // bb2's LE will become forwards once bb1 is moved after bb3.
+    // bb1:           - LoopExit
+    // bb2:
+    //      LE  bb1  - Terminator
+    // bb3:          - Predecessor
+    //      WLS bb1
+    // bb4:          - Header
+    for (auto It = LoopExit->getIterator(); It != Predecessor->getIterator();
+         It++) {
+      MachineBasicBlock *MBB = &*It;
+      for (auto &Terminator : MBB->terminators()) {
+        if (Terminator.getOpcode() != ARM::t2LoopEnd &&
+            Terminator.getOpcode() != ARM::t2LoopEndDec)
+          continue;
+        MachineBasicBlock *LETarget = Terminator.getOperand(2).getMBB();
+        // The LE will become forwards branching if it branches to LoopExit
+        // which isn't allowed by the architecture, so we should avoid
+        // introducing these.
+        // TODO: Analyse the blocks to make a decision if it would be worth
+        // moving LoopExit even if we'd introduce a forwards LE
+        if (LETarget == LoopExit) {
+          LLVM_DEBUG(dbgs() << DEBUG_PREFIX
+                            << "Can't move the target block as it would "
+                               "introduce a new forwards LE branch\n");
+          CanMove = false;
+          break;
+        }
+      }
+    }
+  }
+
+  if (CanMove)
+    moveBasicBlock(LoopExit, Predecessor);
+
+  return CanMove;
+}
+
+/// Updates ordering (of WLS BB and their loopExits) in inner loops first
+/// Returns true if any change was made in any of the loops
+bool ARMBlockPlacement::processPostOrderLoops(MachineLoop *ML) {
+  bool Changed = false;
+  for (auto *InnerML : *ML)
+    Changed |= processPostOrderLoops(InnerML);
+  return Changed | fixBackwardsWLS(ML);
+}
+
 bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
-      return false;
+    return false;
   const ARMSubtarget &ST = static_cast<const ARMSubtarget &>(MF.getSubtarget());
   if (!ST.hasLOB())
     return false;
@@ -72,109 +200,9 @@ bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   BBUtils->adjustBBOffsetsAfter(&MF.front());
   bool Changed = false;
 
-  // Find loops with a backwards branching WLS.
-  // This requires looping over the loops in the function, checking each
-  // preheader for a WLS and if its target is before the preheader. If moving
-  // the target block wouldn't produce another backwards WLS or a new forwards
-  // LE branch then move the target block after the preheader.
-  for (auto *ML : *MLI) {
-    MachineBasicBlock *Preheader = ML->getLoopPredecessor();
-    if (!Preheader)
-      continue;
-
-    for (auto &Terminator : Preheader->terminators()) {
-      if (Terminator.getOpcode() != ARM::t2WhileLoopStartLR)
-        continue;
-      MachineBasicBlock *LoopExit = Terminator.getOperand(2).getMBB();
-      // We don't want to move the function's entry block.
-      if (!LoopExit->getPrevNode())
-        continue;
-      if (blockIsBefore(Preheader, LoopExit))
-        continue;
-      LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from "
-                        << Preheader->getFullName() << " to "
-                        << LoopExit->getFullName() << "\n");
-
-      // Make sure that moving the target block doesn't cause any of its WLSs
-      // that were previously not backwards to become backwards
-      bool CanMove = true;
-      for (auto &LoopExitTerminator : LoopExit->terminators()) {
-        if (LoopExitTerminator.getOpcode() != ARM::t2WhileLoopStartLR)
-          continue;
-        // An example loop structure where the LoopExit can't be moved, since
-        // bb1's WLS will become backwards once it's moved after bb3 bb1: -
-        // LoopExit
-        //      WLS bb2  - LoopExit2
-        // bb2:
-        //      ...
-        // bb3:          - Preheader
-        //      WLS bb1
-        // bb4:          - Header
-        MachineBasicBlock *LoopExit2 =
-            LoopExitTerminator.getOperand(2).getMBB();
-        // If the WLS from LoopExit to LoopExit2 is already backwards then
-        // moving LoopExit won't affect it, so it can be moved. If LoopExit2 is
-        // after the Preheader then moving will keep it as a forward branch, so
-        // it can be moved. If LoopExit2 is between the Preheader and LoopExit
-        // then moving LoopExit will make it a backwards branch, so it can't be
-        // moved since we'd fix one and introduce one backwards branch.
-        // TODO: Analyse the blocks to make a decision if it would be worth
-        // moving LoopExit even if LoopExit2 is between the Preheader and
-        // LoopExit.
-        if (!blockIsBefore(LoopExit2, LoopExit) &&
-            (LoopExit2 == Preheader || blockIsBefore(LoopExit2, Preheader))) {
-          LLVM_DEBUG(dbgs() << DEBUG_PREFIX
-                            << "Can't move the target block as it would "
-                               "introduce a new backwards WLS branch\n");
-          CanMove = false;
-          break;
-        }
-      }
-
-      if (CanMove) {
-        // Make sure no LEs become forwards.
-        // An example loop structure where the LoopExit can't be moved, since
-        // bb2's LE will become forwards once bb1 is moved after bb3.
-        // bb1:           - LoopExit
-        // bb2:
-        //      LE  bb1  - Terminator
-        // bb3:          - Preheader
-        //      WLS bb1
-        // bb4:          - Header
-        for (auto It = LoopExit->getIterator(); It != Preheader->getIterator();
-             It++) {
-          MachineBasicBlock *MBB = &*It;
-          for (auto &Terminator : MBB->terminators()) {
-            if (Terminator.getOpcode() != ARM::t2LoopEnd &&
-                Terminator.getOpcode() != ARM::t2LoopEndDec)
-              continue;
-            MachineBasicBlock *LETarget = Terminator.getOperand(2).getMBB();
-            // The LE will become forwards branching if it branches to LoopExit
-            // which isn't allowed by the architecture, so we should avoid
-            // introducing these.
-            // TODO: Analyse the blocks to make a decision if it would be worth
-            // moving LoopExit even if we'd introduce a forwards LE
-            if (LETarget == LoopExit) {
-              LLVM_DEBUG(dbgs() << DEBUG_PREFIX
-                                << "Can't move the target block as it would "
-                                   "introduce a new forwards LE branch\n");
-              CanMove = false;
-              break;
-            }
-          }
-        }
-
-        if (!CanMove)
-          break;
-      }
-
-      if (CanMove) {
-        moveBasicBlock(LoopExit, Preheader);
-        Changed = true;
-        break;
-      }
-    }
-  }
+  // Find loops with a backwards branching WLS and fix if possible.
+  for (auto *ML : *MLI)
+    Changed |= processPostOrderLoops(ML);
 
   return Changed;
 }
@@ -184,6 +212,8 @@ bool ARMBlockPlacement::blockIsBefore(MachineBasicBlock *BB,
   return BBUtils->getOffsetOf(Other) > BBUtils->getOffsetOf(BB);
 }
 
+/// Moves a given MBB to be positioned after another MBB while maintaining
+/// existing control flow
 void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB,
                                        MachineBasicBlock *After) {
   LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Moving " << BB->getName() << " after "
@@ -195,6 +225,9 @@ void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB,
 
   BB->moveAfter(After);
 
+  // Since only the blocks are to be moved around (but the control flow must
+  // not change), if there were any fall-throughs (to/from adjacent blocks),
+  // replace with unconditional branch to the fall through block.
   auto FixFallthrough = [&](MachineBasicBlock *From, MachineBasicBlock *To) {
     LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Checking for fallthrough from "
                       << From->getName() << " to " << To->getName() << "\n");

diff  --git a/llvm/test/CodeGen/Thumb2/block-placement.mir b/llvm/test/CodeGen/Thumb2/block-placement.mir
index 855895b45ee63..9f40817e0c423 100644
--- a/llvm/test/CodeGen/Thumb2/block-placement.mir
+++ b/llvm/test/CodeGen/Thumb2/block-placement.mir
@@ -1,16 +1,19 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -run-pass=arm-block-placement %s -o - | FileCheck %s
 --- |
+  ; Checks that loopExitBlock gets moved (in forward direction) if there is a backwards WLS to it.
   define void @backwards_branch(i32 %N, i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
   entry:
     unreachable
   }
 
+  ; Checks that loopExitBlock does not get reordered (since it is entry block) even if there is a backwards WLS to it.
   define void @backwards_branch_entry_block(i32 %N, i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
   entry:
     unreachable
   }
 
+  ; Checks that loopExitBlock (containing a backwards WLS) is moved (in forward direction) if there is a backwards WLS to it.
   define void @backwards_branch_target_already_backwards(i32 %N, i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
   entry:
     unreachable
@@ -21,16 +24,25 @@
     unreachable
   }
 
+  ; Checks that loopExitBlock (to which a backwards LE exists) is not moved if moving it would cause the LE to become forwards branching.
   define void @backwards_branch_forwards_le(i32 %N, i32 %M, i32* nocapture %a, i32* nocapture %b, i32* nocapture %c) local_unnamed_addr #0 {
   entry:
     unreachable
   }
 
+  ; Checks that a MachineFunction is unaffected if it doesn't contain any WLS (pseudo) instruction.
   define void @no_preheader(i32 %N, i32 %M, i32* nocapture %a, i32* nocapture %b, i32* nocapture %c) local_unnamed_addr #0 {
   entry:
     unreachable
   }
 
+  ; Within a nested loop, checks that loopExit gets moved (in forward direction) if there exists a backwards WLS to it.
+  ; Both the WLS and loopExit are at depth=3.
+  define void @nested_loops(i32 %n, i32 %m, i32 %l, i8* noalias %X, i8* noalias %Y) local_unnamed_addr #0 {
+  entry:
+    unreachable
+  }
+
   declare dso_local i32 @g(...) local_unnamed_addr #1
 
   declare dso_local i32 @h(...) local_unnamed_addr #1
@@ -441,3 +453,188 @@ body:             |
   bb.5:
     frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r7, def $pc
 ...
+---
+name:            nested_loops
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$r0' }
+  - { reg: '$r1' }
+  - { reg: '$r2' }
+  - { reg: '$r3' }
+frameInfo:
+  stackSize:       32
+  maxAlignment:    4
+  maxCallFrameSize: 0
+fixedStack:
+  - { id: 0, size: 4, alignment: 8, isImmutable: true }
+stack:
+  - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, callee-saved-register: '$lr',
+      callee-saved-restored: false }
+  - { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 4, callee-saved-register: '$r10' }
+  - { id: 2, type: spill-slot, offset: -12, size: 4, alignment: 4, callee-saved-register: '$r9' }
+  - { id: 3, type: spill-slot, offset: -16, size: 4, alignment: 4, callee-saved-register: '$r8' }
+  - { id: 4, type: spill-slot, offset: -20, size: 4, alignment: 4, callee-saved-register: '$r7' }
+  - { id: 5, type: spill-slot, offset: -24, size: 4, alignment: 4, callee-saved-register: '$r6' }
+  - { id: 6, type: spill-slot, offset: -28, size: 4, alignment: 4, callee-saved-register: '$r5' }
+  - { id: 7, type: spill-slot, offset: -32, size: 4, alignment: 4, callee-saved-register: '$r4' }
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: nested_loops
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $lr
+  ; CHECK:   $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $lr
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 32
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r10, -8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r9, -12
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r8, -16
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -20
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r6, -24
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r5, -28
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -32
+  ; CHECK:   tCMPi8 renamable $r0, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 11, 8, implicit-def $itstate
+  ; CHECK:   $sp = frame-destroy t2LDMIA_RET $sp, 11 /* CC::lt */, killed $cpsr, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc, implicit killed $itstate
+  ; CHECK: bb.1:
+  ; CHECK:   successors: %bb.3(0x80000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3
+  ; CHECK:   renamable $r12 = t2LDRi12 $sp, 32, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+  ; CHECK:   $r9 = tMOVr killed $r2, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   t2B %bb.3, 14 /* CC::al */, $noreg
+  ; CHECK: bb.2:
+  ; CHECK:   successors: %bb.9(0x04000000), %bb.3(0x7c000000)
+  ; CHECK:   liveins: $r0, $r1, $r3, $r8, $r9, $r12
+  ; CHECK:   renamable $r8 = nuw nsw t2ADDri killed renamable $r8, 1, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 1, 14 /* CC::al */, $noreg
+  ; CHECK:   tCMPhir renamable $r8, renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   renamable $r12 = t2ADDri killed renamable $r12, 1, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   t2Bcc %bb.9, 0 /* CC::eq */, killed $cpsr
+  ; CHECK: bb.3:
+  ; CHECK:   successors: %bb.4(0x50000000), %bb.2(0x30000000)
+  ; CHECK:   liveins: $r0, $r1, $r3, $r8, $r9, $r12
+  ; CHECK:   tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2Bcc %bb.2, 11 /* CC::lt */, killed $cpsr
+  ; CHECK: bb.4:
+  ; CHECK:   successors: %bb.6(0x80000000)
+  ; CHECK:   liveins: $r0, $r1, $r3, $r8, $r9, $r12
+  ; CHECK:   renamable $r4, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
+  ; CHECK:   $r10 = tMOVr $r12, 14 /* CC::al */, $noreg
+  ; CHECK:   $r2 = tMOVr $r3, 14 /* CC::al */, $noreg
+  ; CHECK:   t2B %bb.6, 14 /* CC::al */, $noreg
+  ; CHECK: bb.6:
+  ; CHECK:   successors: %bb.7(0x50000000), %bb.5(0x30000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
+  ; CHECK:   renamable $lr = t2WhileLoopStartLR killed renamable $r9, %bb.5, implicit-def dead $cpsr
+  ; CHECK:   tB %bb.7, 14 /* CC::al */, $noreg
+  ; CHECK: bb.5:
+  ; CHECK:   successors: %bb.2(0x04000000), %bb.6(0x7c000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
+  ; CHECK:   renamable $r4, dead $cpsr = nuw nsw tADDi8 killed renamable $r4, 1, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 1, 14 /* CC::al */, $noreg
+  ; CHECK:   tCMPr renamable $r4, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   renamable $r10 = t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   t2Bcc %bb.2, 0 /* CC::eq */, killed $cpsr
+  ; CHECK:   tB %bb.6, 14 /* CC::al */, $noreg
+  ; CHECK: bb.7:
+  ; CHECK:   successors: %bb.8(0x80000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
+  ; CHECK:   $r5 = tMOVr $r10, 14 /* CC::al */, $noreg
+  ; CHECK:   $r6 = tMOVr $r2, 14 /* CC::al */, $noreg
+  ; CHECK:   t2B %bb.8, 14 /* CC::al */, $noreg
+  ; CHECK: bb.8:
+  ; CHECK:   successors: %bb.8(0x7c000000), %bb.5(0x04000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r8, $r9, $r10, $r12
+  ; CHECK:   tSTRi killed $r0, $r1, 0, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $lr = t2LoopEndDec killed renamable $lr, %bb.8, implicit-def dead $cpsr
+  ; CHECK:   t2B %bb.5, 14 /* CC::al */, $noreg
+  ; CHECK: bb.9:
+  ; CHECK:   $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc
+  bb.0:
+    successors: %bb.1
+    liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $lr
+
+    $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $lr
+    frame-setup CFI_INSTRUCTION def_cfa_offset 32
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r10, -8
+    frame-setup CFI_INSTRUCTION offset $r9, -12
+    frame-setup CFI_INSTRUCTION offset $r8, -16
+    frame-setup CFI_INSTRUCTION offset $r7, -20
+    frame-setup CFI_INSTRUCTION offset $r6, -24
+    frame-setup CFI_INSTRUCTION offset $r5, -28
+    frame-setup CFI_INSTRUCTION offset $r4, -32
+    tCMPi8 renamable $r0, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2IT 11, 8, implicit-def $itstate
+    $sp = frame-destroy t2LDMIA_RET $sp, 11 /* CC::lt */, killed $cpsr, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc, implicit killed $itstate
+
+  bb.1:
+    liveins: $r0, $r1, $r2, $r3
+
+    renamable $r12 = t2LDRi12 $sp, 32, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
+    $r9 = tMOVr killed $r2, 14 /* CC::al */, $noreg
+    renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    t2B %bb.2, 14 /* CC::al */, $noreg
+
+  bb.8:
+    successors: %bb.9(0x04000000), %bb.2(0x7c000000)
+    liveins: $r0, $r1, $r3, $r8, $r9, $r12
+
+    renamable $r8 = nuw nsw t2ADDri killed renamable $r8, 1, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 1, 14 /* CC::al */, $noreg
+    tCMPhir renamable $r8, renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    renamable $r12 = t2ADDri killed renamable $r12, 1, 14 /* CC::al */, $noreg, $noreg
+    t2Bcc %bb.9, 0 /* CC::eq */, killed $cpsr
+
+  bb.2:
+    successors: %bb.3(0x50000000), %bb.8(0x30000000)
+    liveins: $r0, $r1, $r3, $r8, $r9, $r12
+
+    tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2Bcc %bb.8, 11 /* CC::lt */, killed $cpsr
+
+  bb.3:
+    liveins: $r0, $r1, $r3, $r8, $r9, $r12
+
+    renamable $r4, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
+    $r10 = tMOVr $r12, 14 /* CC::al */, $noreg
+    $r2 = tMOVr $r3, 14 /* CC::al */, $noreg
+    t2B %bb.4, 14 /* CC::al */, $noreg
+
+  bb.7:
+    successors: %bb.8(0x04000000), %bb.4(0x7c000000)
+    liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
+
+    renamable $r4, dead $cpsr = nuw nsw tADDi8 killed renamable $r4, 1, 14 /* CC::al */, $noreg
+    renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 1, 14 /* CC::al */, $noreg
+    tCMPr renamable $r4, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    renamable $r10 = t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg
+    t2Bcc %bb.8, 0 /* CC::eq */, killed $cpsr
+
+  bb.4:
+    successors: %bb.5(0x50000000), %bb.7(0x30000000)
+    liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
+
+    renamable $lr = t2WhileLoopStartLR killed renamable $r9, %bb.7, implicit-def dead $cpsr
+
+  bb.5:
+    liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
+
+    $r5 = tMOVr $r10, 14 /* CC::al */, $noreg
+    $r6 = tMOVr $r2, 14 /* CC::al */, $noreg
+    t2B %bb.6, 14 /* CC::al */, $noreg
+
+  bb.6:
+    successors: %bb.6(0x7c000000), %bb.7(0x04000000)
+    liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r8, $r9, $r10, $r12
+
+    tSTRi killed $r0, $r1, 0, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopEndDec killed renamable $lr, %bb.6, implicit-def dead $cpsr
+    t2B %bb.7, 14 /* CC::al */, $noreg
+
+  bb.9:
+    $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc
+
+...

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index dd8c4f110691d..7f5acd19e0b35 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1077,18 +1077,10 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT:    b .LBB16_4
-; CHECK-NEXT:  .LBB16_3: @ %while.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    subs.w r12, r12, #1
-; CHECK-NEXT:    vstrb.8 q0, [r2], #8
-; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
-; CHECK-NEXT:    add.w r5, r0, #8
-; CHECK-NEXT:    beq.w .LBB16_12
-; CHECK-NEXT:  .LBB16_4: @ %while.body
+; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:  .LBB16_3: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB16_6 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_5 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    ldrh.w lr, [r3, #14]
@@ -1125,14 +1117,14 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vfma.f16 q0, q1, lr
 ; CHECK-NEXT:    cmp r0, #16
-; CHECK-NEXT:    blo .LBB16_7
-; CHECK-NEXT:  @ %bb.5: @ %for.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    blo .LBB16_6
+; CHECK-NEXT:  @ %bb.4: @ %for.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_3 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_6: @ %for.body
-; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
+; CHECK-NEXT:  .LBB16_5: @ %for.body
+; CHECK-NEXT:    @ Parent Loop BB16_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh r0, [r6], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
@@ -1163,33 +1155,39 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    adds r5, #16
 ; CHECK-NEXT:    vfma.f16 q0, q1, r4
-; CHECK-NEXT:    le lr, .LBB16_6
-; CHECK-NEXT:    b .LBB16_8
-; CHECK-NEXT:  .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    le lr, .LBB16_5
+; CHECK-NEXT:    b .LBB16_7
+; CHECK-NEXT:  .LBB16_6: @ in Loop: Header=BB16_3 Depth=1
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_8: @ %for.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:  .LBB16_7: @ %for.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_3 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    subs.w lr, r0, #0
-; CHECK-NEXT:    beq.w .LBB16_3
+; CHECK-NEXT:    wls lr, r0, .LBB16_8
 ; CHECK-NEXT:    b .LBB16_9
+; CHECK-NEXT:  .LBB16_8: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_3 Depth=1
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    subs.w r12, r12, #1
+; CHECK-NEXT:    vstrb.8 q0, [r2], #8
+; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
+; CHECK-NEXT:    add.w r5, r0, #8
+; CHECK-NEXT:    beq .LBB16_12
+; CHECK-NEXT:    b .LBB16_3
 ; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB16_3 Depth=1
 ; CHECK-NEXT:    mov r0, r5
 ; CHECK-NEXT:  .LBB16_10: @ %while.body76
-; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB16_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh r4, [r6], #2
 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #2
 ; CHECK-NEXT:    vfma.f16 q0, q1, r4
-; CHECK-NEXT:    subs.w lr, lr, #1
-; CHECK-NEXT:    bne .LBB16_10
-; CHECK-NEXT:    b .LBB16_11
-; CHECK-NEXT:  .LBB16_11: @ %while.end.loopexit
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    le lr, .LBB16_10
+; CHECK-NEXT:  @ %bb.11: @ %while.end.loopexit
+; CHECK-NEXT:    @ in Loop: Header=BB16_3 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r5, r5, r0, lsl #1
-; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:    b .LBB16_8
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
 ; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 2fd717bf2d478..19a710974548d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1071,18 +1071,10 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    str r6, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    b .LBB16_4
-; CHECK-NEXT:  .LBB16_3: @ %while.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    subs.w r12, r12, #1
-; CHECK-NEXT:    vstrb.8 q0, [r2], #16
-; CHECK-NEXT:    add.w r0, r4, r0, lsl #2
-; CHECK-NEXT:    add.w r4, r0, #16
-; CHECK-NEXT:    beq .LBB16_12
-; CHECK-NEXT:  .LBB16_4: @ %while.body
+; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:  .LBB16_3: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB16_6 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_5 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
 ; CHECK-NEXT:    add.w lr, r10, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
@@ -1109,14 +1101,14 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    vfma.f32 q0, q3, r11
 ; CHECK-NEXT:    cmp r0, #16
 ; CHECK-NEXT:    vfma.f32 q0, q1, r8
-; CHECK-NEXT:    blo .LBB16_7
-; CHECK-NEXT:  @ %bb.5: @ %for.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    blo .LBB16_6
+; CHECK-NEXT:  @ %bb.4: @ %for.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_3 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_6: @ %for.body
-; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
+; CHECK-NEXT:  .LBB16_5: @ %for.body
+; CHECK-NEXT:    @ Parent Loop BB16_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldm.w r7, {r0, r3, r5, r6, r8, r11}
 ; CHECK-NEXT:    vldrw.u32 q1, [r4], #32
@@ -1137,34 +1129,40 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    vfma.f32 q0, q2, r11
 ; CHECK-NEXT:    vfma.f32 q0, q3, r9
 ; CHECK-NEXT:    vfma.f32 q0, q1, r1
-; CHECK-NEXT:    le lr, .LBB16_6
-; CHECK-NEXT:    b .LBB16_8
-; CHECK-NEXT:  .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    le lr, .LBB16_5
+; CHECK-NEXT:    b .LBB16_7
+; CHECK-NEXT:  .LBB16_6: @ in Loop: Header=BB16_3 Depth=1
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_8: @ %for.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:  .LBB16_7: @ %for.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_3 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
-; CHECK-NEXT:    subs.w lr, r0, #0
-; CHECK-NEXT:    beq .LBB16_3
+; CHECK-NEXT:    wls lr, r0, .LBB16_8
 ; CHECK-NEXT:    b .LBB16_9
+; CHECK-NEXT:  .LBB16_8: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_3 Depth=1
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    subs.w r12, r12, #1
+; CHECK-NEXT:    vstrb.8 q0, [r2], #16
+; CHECK-NEXT:    add.w r0, r4, r0, lsl #2
+; CHECK-NEXT:    add.w r4, r0, #16
+; CHECK-NEXT:    beq .LBB16_12
+; CHECK-NEXT:    b .LBB16_3
 ; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB16_3 Depth=1
 ; CHECK-NEXT:    mov r3, r4
 ; CHECK-NEXT:  .LBB16_10: @ %while.body76
-; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB16_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldr r0, [r7], #4
 ; CHECK-NEXT:    vldrw.u32 q1, [r3], #4
 ; CHECK-NEXT:    vfma.f32 q0, q1, r0
-; CHECK-NEXT:    subs.w lr, lr, #1
-; CHECK-NEXT:    bne .LBB16_10
-; CHECK-NEXT:    b .LBB16_11
-; CHECK-NEXT:  .LBB16_11: @ %while.end.loopexit
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    le lr, .LBB16_10
+; CHECK-NEXT:  @ %bb.11: @ %while.end.loopexit
+; CHECK-NEXT:    @ in Loop: Header=BB16_3 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r4, r4, r0, lsl #2
-; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:    b .LBB16_8
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}


        


More information about the llvm-commits mailing list