[llvm] acbc9ae - [ARM][MVE] Fixes for tail predication.

Sam Parker via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 20 01:35:12 PST 2019


Author: Sam Parker
Date: 2019-12-20T09:34:18Z
New Revision: acbc9aed726d4b7428691e026a214cb26ee2cf94

URL: https://github.com/llvm/llvm-project/commit/acbc9aed726d4b7428691e026a214cb26ee2cf94
DIFF: https://github.com/llvm/llvm-project/commit/acbc9aed726d4b7428691e026a214cb26ee2cf94.diff

LOG: [ARM][MVE] Fixes for tail predication.

1) Fix an issue with the incorrect value being used for the number of
   elements being passed to [d|w]lstp. We were trying to check that
   the value was available at LoopStart, but this doesn't consider
   that the last instruction in the block could also define the
   register. Two helpers have been added to RDA for this.
2) Insert some code to now try to move the element count def or the
   insertion point so that we can perform more tail predication.
3) Related to (1), the same off-by-one could prevent us from
   generating a low-overhead loop when a mov lr could have been
   the last instruction in the block.
4) Fix up some instruction attributes so that not all the
   low-overhead loop instructions are labelled as branches and
   terminators - as this is not true for dls/dlstp.

Differential Revision: https://reviews.llvm.org/D71609

Added: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir

Modified: 
    llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
    llvm/lib/CodeGen/ReachingDefAnalysis.cpp
    llvm/lib/Target/ARM/ARMInstrMVE.td
    llvm/lib/Target/ARM/ARMInstrThumb2.td
    llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
index 685ba94e57aa..5a747245a62e 100644
--- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
+++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
@@ -106,6 +106,15 @@ class ReachingDefAnalysis : public MachineFunctionPass {
   /// Return whether A and B use the same def of PhysReg.
   bool hasSameReachingDef(MachineInstr *A, MachineInstr *B, int PhysReg);
 
+  /// Return whether the reaching def for MI also is live out of its parent
+  /// block.
+  bool isReachingDefLiveOut(MachineInstr *MI, int PhysReg);
+
+  /// Return the local MI that produces the live out value for PhysReg, or
+  /// nullptr for a non-live out or non-local def.
+  MachineInstr *getLocalLiveOutMIDef(MachineBasicBlock *MBB,
+                                     int PhysReg);
+
   /// Return whether the given register is used after MI, whether it's a local
   /// use or a live out.
   bool isRegUsedAfter(MachineInstr *MI, int PhysReg);

diff  --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
index e5b422e0b7ed..61ae3b75ab5e 100644
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -231,15 +231,15 @@ void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, int PhysReg,
   MachineBasicBlock *MBB = Def->getParent();
   MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def);
   while (++MI != MBB->end()) {
+    // If/when we find a new reaching def, we know that there's no more uses
+    // of 'Def'.
+    if (getReachingMIDef(&*MI, PhysReg) != Def)
+      return;
+
     for (auto &MO : MI->operands()) {
       if (!MO.isReg() || !MO.isUse() || MO.getReg() != PhysReg)
         continue;
 
-      // If/when we find a new reaching def, we know that there's no more uses
-      // of 'Def'.
-      if (getReachingMIDef(&*MI, PhysReg) != Def)
-        return;
-
       Uses.push_back(&*MI);
       if (MO.isKill())
         return;
@@ -272,6 +272,42 @@ bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) {
   return false;
 }
 
+bool ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI, int PhysReg) {
+  MachineBasicBlock *MBB = MI->getParent();
+  LivePhysRegs LiveRegs(*TRI);
+  LiveRegs.addLiveOuts(*MBB);
+  if (!LiveRegs.contains(PhysReg))
+    return false;
+
+  MachineInstr *Last = &MBB->back();
+  int Def = getReachingDef(MI, PhysReg);
+  if (getReachingDef(Last, PhysReg) != Def)
+    return false;
+
+  // Finally check that the last instruction doesn't redefine the register.
+  for (auto &MO : Last->operands())
+    if (MO.isReg() && MO.isDef() && MO.getReg() == PhysReg)
+      return false;
+
+  return true;
+}
+
+MachineInstr* ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB,
+                                                        int PhysReg) {
+  LivePhysRegs LiveRegs(*TRI);
+  LiveRegs.addLiveOuts(*MBB);
+  if (!LiveRegs.contains(PhysReg))
+    return nullptr;
+
+  MachineInstr *Last = &MBB->back();
+  int Def = getReachingDef(Last, PhysReg);
+  for (auto &MO : Last->operands())
+    if (MO.isReg() && MO.isDef() && MO.getReg() == PhysReg)
+      return Last;
+
+  return Def < 0 ? nullptr : getInstFromId(MBB, Def);
+}
+
 MachineInstr *ReachingDefAnalysis::getInstWithUseBefore(MachineInstr *MI,
     int PhysReg) {
   auto I = MachineBasicBlock::reverse_iterator(MI);

diff  --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 1b631b4b9e1b..894580605855 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -5717,6 +5717,8 @@ class MVE_WLSTP<string asm, bits<2> size>
   let Inst{13} = 0b0;
   let Inst{11} = label{0};
   let Inst{10-1} = label{10-1};
+  let isBranch = 1;
+  let isTerminator = 1;
 }
 
 def MVE_DLSTP_8  : MVE_DLSTP<"dlstp.8",  0b00>;
@@ -5745,6 +5747,8 @@ def MVE_LETP : MVE_loltp_end<(outs GPRlr:$LRout),
   let Inst{13} = 0b0;
   let Inst{11} = label{0};
   let Inst{10-1} = label{10-1};
+  let isBranch = 1;
+  let isTerminator = 1;
 }
 
 def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> {

diff  --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 00921930e71a..d07d6ec119f0 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -5189,8 +5189,6 @@ class t2LOL<dag oops, dag iops, string asm, string ops>
   let Inst{31-23} = 0b111100000;
   let Inst{15-14} = 0b11;
   let Inst{0} = 0b1;
-  let isBranch = 1;
-  let isTerminator = 1;
   let DecoderMethod = "DecodeLOLoop";
   let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB];
 }
@@ -5207,13 +5205,13 @@ def t2WLS : t2LOL<(outs GPRlr:$LR),
   let Inst{11} = label{0};
   let Inst{10-1} = label{10-1};
   let usesCustomInserter = 1;
+  let isBranch = 1;
+  let isTerminator = 1;
 }
 
 def t2DLS : t2LOL<(outs GPRlr:$LR), (ins rGPR:$Rn),
                   "dls", "$LR, $Rn"> {
   bits<4> Rn;
-  let isBranch = 0;
-  let isTerminator = 0;
   let Inst{22-20} = 0b100;
   let Inst{19-16} = Rn{3-0};
   let Inst{13-1} = 0b1000000000000;
@@ -5229,6 +5227,8 @@ def t2LEUpdate : t2LOL<(outs GPRlr:$LRout),
   let Inst{11} = label{0};
   let Inst{10-1} = label{10-1};
   let usesCustomInserter = 1;
+  let isBranch = 1;
+  let isTerminator = 1;
 }
 
 def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> {
@@ -5237,6 +5237,8 @@ def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> {
   let Inst{13-12} = 0b00;
   let Inst{11} = label{0};
   let Inst{10-1} = label{10-1};
+  let isBranch = 1;
+  let isTerminator = 1;
 }
 
 def t2DoLoopStart :

diff  --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index ec62a6975f00..90bed3243ec1 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -23,6 +23,7 @@
 #include "ARMBasicBlockInfo.h"
 #include "ARMSubtarget.h"
 #include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineLoopUtils.h"
@@ -268,12 +269,15 @@ MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) {
   // Find an insertion point:
   // - Is there a (mov lr, Count) before Start? If so, and nothing else writes
   //   to Count before Start, we can insert at that mov.
+  if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR))
+    if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
+      return LRDef;
+
   // - Is there a (mov lr, Count) after Start? If so, and nothing else writes
   //   to Count after Start, we can insert at that mov.
-  if (auto *LRDef = RDA->getReachingMIDef(&MBB->back(), ARM::LR)) {
+  if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR))
     if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
       return LRDef;
-  }
 
   // We've found no suitable LR def and Start doesn't use LR directly. Can we
   // just define LR anyway?
@@ -283,6 +287,32 @@ MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) {
   return nullptr;
 }
 
+// Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must
+// not define a register that is used by any instructions, after and including,
+// 'To'. These instructions also must not redefine any of Froms operands.
+template<typename Iterator>
+static bool IsSafeToMove(MachineInstr *From, MachineInstr *To, ReachingDefAnalysis *RDA) {
+  SmallSet<int, 2> Defs;
+  // First check that From would compute the same value if moved.
+  for (auto &MO : From->operands()) {
+    if (!MO.isReg() || MO.isUndef() || !MO.getReg())
+      continue;
+    if (MO.isDef())
+      Defs.insert(MO.getReg());
+    else if (!RDA->hasSameReachingDef(From, To, MO.getReg()))
+      return false;
+  }
+
+  // Now walk checking that the rest of the instructions will compute the same
+  // value.
+  for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) {
+    for (auto &MO : I->operands())
+      if (MO.isReg() && MO.getReg() && MO.isUse() && Defs.count(MO.getReg()))
+        return false;
+  }
+  return true;
+}
+
 void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
                                     ReachingDefAnalysis *RDA,
                                     MachineLoopInfo *MLI) {
@@ -369,13 +399,26 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
     return;
   }
 
-  // We can't perform TP if the register does not hold the same value at
-  // InsertPt as the liveout value.
+  // The element count register maybe defined after InsertPt, in which case we
+  // need to try to move either InsertPt or the def so that the [w|d]lstp can
+  // use the value.
   MachineBasicBlock *InsertBB = InsertPt->getParent();
-  if  (!RDA->hasSameReachingDef(InsertPt, &InsertBB->back(),
-                                NumElements)) {
-    CannotTailPredicate = true;
-    return;
+  if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) {
+    if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) {
+      if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) {
+        ElemDef->removeFromParent();
+        InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef);
+        LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: "
+                   << *ElemDef);
+      } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) {
+        InsertPt->removeFromParent();
+        InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt);
+        LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
+      } else {
+        CannotTailPredicate = true;
+        return;
+      }
+    }
   }
 
   // Especially in the case of while loops, InsertBB may not be the

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir
new file mode 100644
index 000000000000..57fe0492f1eb
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir
@@ -0,0 +1,175 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s
+--- |
+  define dso_local arm_aapcs_vfpcc void @start_before_elems(i32* noalias nocapture %a, i8* nocapture readonly %b, i8* nocapture readonly %c, i32 %N) local_unnamed_addr #0 {
+  entry:
+    %div = lshr i32 %N, 1
+    %cmp9 = icmp eq i32 %div, 0
+    %0 = add nuw i32 %div, 3
+    %1 = lshr i32 %0, 2
+    %2 = shl nuw i32 %1, 2
+    %3 = add i32 %2, -4
+    %4 = lshr i32 %3, 2
+    %5 = add nuw nsw i32 %4, 1
+    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
+
+  vector.ph:                                        ; preds = %entry
+    call void @llvm.set.loop.iterations.i32(i32 %5)
+    br label %vector.body
+
+  vector.body:                                      ; preds = %vector.body, %vector.ph
+    %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ]
+    %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
+    %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
+    %9 = sub i32 %7, 4
+    %scevgep4 = getelementptr i8, i8* %b, i32 %index
+    %scevgep45 = bitcast i8* %scevgep4 to <4 x i8>*
+    %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %scevgep45, i32 1, <4 x i1> %8, <4 x i8> undef)
+    %10 = zext <4 x i8> %wide.masked.load to <4 x i32>
+    %scevgep2 = getelementptr i8, i8* %c, i32 %index
+    %scevgep23 = bitcast i8* %scevgep2 to <4 x i8>*
+    %wide.masked.load13 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %scevgep23, i32 1, <4 x i1> %8, <4 x i8> undef)
+    %11 = zext <4 x i8> %wide.masked.load13 to <4 x i32>
+    %12 = mul nuw nsw <4 x i32> %11, %10
+    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %12, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %8)
+    %index.next = add i32 %index, 4
+    %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+    %13 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+    %14 = icmp ne i32 %13, 0
+    br i1 %14, label %vector.body, label %for.cond.cleanup
+
+  for.cond.cleanup:                                 ; preds = %vector.body, %entry
+    ret void
+  }
+  declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>)
+  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+...
+---
+name:            start_before_elems
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+  - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: start_before_elems
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $lr
+  ; CHECK:   frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
+  ; CHECK:   renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
+  ; CHECK:   t2CMPrs killed renamable $r12, renamable $r3, 11, 14, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 0, 8, implicit-def $itstate
+  ; CHECK:   tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+  ; CHECK:   renamable $r12 = t2LSRri killed renamable $r3, 1, 14, $noreg, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
+  ; CHECK:   $lr = MVE_DLSTP_32 renamable $r12
+  ; CHECK: bb.1.vector.body:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r12
+  ; CHECK:   renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg
+  ; CHECK:   renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1)
+  ; CHECK:   renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+  ; CHECK:   renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1)
+  ; CHECK:   renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4)
+  ; CHECK:   $lr = MVE_LETP renamable $lr, %bb.1
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   tPOP_RET 14, $noreg, def $r4, def $pc
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+
+    frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r4, -8
+    renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
+    t2CMPrs killed renamable $r12, renamable $r3, 11, 14, $noreg, implicit-def $cpsr
+    t2IT 0, 8, implicit-def $itstate
+    tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+    renamable $r12 = t2MOVi 3, 14, $noreg, $noreg
+    renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+    renamable $r12 = nuw t2ADDrs killed renamable $r12, renamable $r3, 11, 14, $noreg, $noreg
+    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+    renamable $r5 = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+    renamable $r12 = t2LSRri killed renamable $r3, 1, 14, $noreg, $noreg
+    renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
+    t2DoLoopStart renamable $r5
+    $lr = tMOVr killed $r5, 14, $noreg
+
+  bb.1.vector.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $r0, $r1, $r2, $r3, $r12
+
+    renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg
+    renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep45, align 1)
+    renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg
+    renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg
+    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep23, align 1)
+    renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+    MVE_VPST 8, implicit $vpr
+    renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14, $noreg
+
+  bb.2.for.cond.cleanup:
+    tPOP_RET 14, $noreg, def $r4, def $pc
+
+...

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir
new file mode 100644
index 000000000000..ff49bb0770ee
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir
@@ -0,0 +1,183 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
+
+# Test that, though the vctp operand is defined at the end of the block,
+# that the correct value is used for the dlstp.
+# TODO: The pass currently just bails instead of finding the correct
+# value.
+
+--- |
+  define dso_local arm_aapcs_vfpcc void @start_before_elems(i32* noalias nocapture %a, i8* nocapture readonly %b, i8* nocapture readonly %c, i32 %N) local_unnamed_addr #0 {
+  entry:
+    %div = lshr i32 %N, 1
+    %cmp9 = icmp eq i32 %div, 0
+    %0 = add nuw i32 %div, 3
+    %1 = lshr i32 %0, 2
+    %2 = shl nuw i32 %1, 2
+    %3 = add i32 %2, -4
+    %4 = lshr i32 %3, 2
+    %5 = add nuw nsw i32 %4, 1
+    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
+
+  vector.ph:                                        ; preds = %entry
+    call void @llvm.set.loop.iterations.i32(i32 %5)
+    br label %vector.body
+
+  vector.body:                                      ; preds = %vector.body, %vector.ph
+    %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ]
+    %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
+    %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
+    %9 = sub i32 %7, 4
+    %scevgep4 = getelementptr i8, i8* %b, i32 %index
+    %scevgep45 = bitcast i8* %scevgep4 to <4 x i8>*
+    %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %scevgep45, i32 1, <4 x i1> %8, <4 x i8> undef)
+    %10 = zext <4 x i8> %wide.masked.load to <4 x i32>
+    %scevgep2 = getelementptr i8, i8* %c, i32 %index
+    %scevgep23 = bitcast i8* %scevgep2 to <4 x i8>*
+    %wide.masked.load13 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %scevgep23, i32 1, <4 x i1> %8, <4 x i8> undef)
+    %11 = zext <4 x i8> %wide.masked.load13 to <4 x i32>
+    %12 = mul nuw nsw <4 x i32> %11, %10
+    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %12, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %8)
+    %index.next = add i32 %index, 4
+    %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+    %13 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+    %14 = icmp ne i32 %13, 0
+    br i1 %14, label %vector.body, label %for.cond.cleanup
+
+  for.cond.cleanup:                                 ; preds = %vector.body, %entry
+    ret void
+  }
+  declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #1
+  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
+  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
+
+...
+---
+name:            start_before_elems
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+  - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: start_before_elems
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $lr
+  ; CHECK:   frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
+  ; CHECK:   renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
+  ; CHECK:   t2CMPrs killed renamable $r12, renamable $r3, 11, 14, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 0, 8, implicit-def $itstate
+  ; CHECK:   tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+  ; CHECK:   $r12 = t2MOVr killed $r3, 14, $noreg, $noreg
+  ; CHECK:   renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg
+  ; CHECK:   $lr = MVE_DLSTP_32 renamable $r12
+  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
+  ; CHECK: bb.1.vector.body:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r12
+  ; CHECK:   renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg
+  ; CHECK:   renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1)
+  ; CHECK:   renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+  ; CHECK:   renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1)
+  ; CHECK:   renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4)
+  ; CHECK:   $lr = MVE_LETP renamable $lr, %bb.1
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   tPOP_RET 14, $noreg, def $r4, def $pc
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+
+    frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r4, -8
+    renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
+    t2CMPrs killed renamable $r12, renamable $r3, 11, 14, $noreg, implicit-def $cpsr
+    t2IT 0, 8, implicit-def $itstate
+    tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+    renamable $r12 = t2MOVi 3, 14, $noreg, $noreg
+    renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+    renamable $r12 = nuw t2ADDrs killed renamable $r12, renamable $r3, 11, 14, $noreg, $noreg
+    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+    $r12 = t2MOVr killed $r3, 14, $noreg, $noreg
+    t2DoLoopStart renamable $lr
+    renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
+    renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg
+
+  bb.1.vector.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $r0, $r1, $r2, $r3, $r12
+
+    renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg
+    renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep45, align 1)
+    renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg
+    renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg
+    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep23, align 1)
+    renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+    MVE_VPST 8, implicit $vpr
+    renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14, $noreg
+
+  bb.2.for.cond.cleanup:
+    tPOP_RET 14, $noreg, def $r4, def $pc
+
+...

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir
new file mode 100644
index 000000000000..3fb203ee1938
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir
@@ -0,0 +1,181 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
+
+# Test that, though the vctp operand is defined at the end of the block,
+# that the correct value is used for the dlstp.
+
+--- |
+  define dso_local arm_aapcs_vfpcc void @start_before_elems(i32* noalias nocapture %a, i8* nocapture readonly %b, i8* nocapture readonly %c, i32 %N) local_unnamed_addr #0 {
+  entry:
+    %div = lshr i32 %N, 1
+    %cmp9 = icmp eq i32 %div, 0
+    %0 = add nuw i32 %div, 3
+    %1 = lshr i32 %0, 2
+    %2 = shl nuw i32 %1, 2
+    %3 = add i32 %2, -4
+    %4 = lshr i32 %3, 2
+    %5 = add nuw nsw i32 %4, 1
+    br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
+
+  vector.ph:                                        ; preds = %entry
+    call void @llvm.set.loop.iterations.i32(i32 %5)
+    br label %vector.body
+
+  vector.body:                                      ; preds = %vector.body, %vector.ph
+    %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+    %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ]
+    %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ]
+    %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
+    %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
+    %9 = sub i32 %7, 4
+    %scevgep4 = getelementptr i8, i8* %b, i32 %index
+    %scevgep45 = bitcast i8* %scevgep4 to <4 x i8>*
+    %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %scevgep45, i32 1, <4 x i1> %8, <4 x i8> undef)
+    %10 = zext <4 x i8> %wide.masked.load to <4 x i32>
+    %scevgep2 = getelementptr i8, i8* %c, i32 %index
+    %scevgep23 = bitcast i8* %scevgep2 to <4 x i8>*
+    %wide.masked.load13 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %scevgep23, i32 1, <4 x i1> %8, <4 x i8> undef)
+    %11 = zext <4 x i8> %wide.masked.load13 to <4 x i32>
+    %12 = mul nuw nsw <4 x i32> %11, %10
+    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %12, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %8)
+    %index.next = add i32 %index, 4
+    %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+    %13 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+    %14 = icmp ne i32 %13, 0
+    br i1 %14, label %vector.body, label %for.cond.cleanup
+
+  for.cond.cleanup:                                 ; preds = %vector.body, %entry
+    ret void
+  }
+  declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #1
+  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
+  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
+
+...
+---
+name:            start_before_elems
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+  - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: start_before_elems
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $lr
+  ; CHECK:   frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
+  ; CHECK:   renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
+  ; CHECK:   t2CMPrs killed renamable $r12, renamable $r3, 11, 14, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 0, 8, implicit-def $itstate
+  ; CHECK:   tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+  ; CHECK:   $r12 = t2MOVr killed $r3, 14, $noreg, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
+  ; CHECK:   renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg
+  ; CHECK:   $lr = MVE_DLSTP_32 renamable $r12
+  ; CHECK: bb.1.vector.body:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3, $r12
+  ; CHECK:   renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg
+  ; CHECK:   renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1)
+  ; CHECK:   renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+  ; CHECK:   renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1)
+  ; CHECK:   renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4)
+  ; CHECK:   $lr = MVE_LETP renamable $lr, %bb.1
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   tPOP_RET 14, $noreg, def $r4, def $pc
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r3, $r4, $lr
+
+    frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r4, -8
+    renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
+    t2CMPrs killed renamable $r12, renamable $r3, 11, 14, $noreg, implicit-def $cpsr
+    t2IT 0, 8, implicit-def $itstate
+    tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+    renamable $r12 = t2MOVi 3, 14, $noreg, $noreg
+    renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+    renamable $r12 = nuw t2ADDrs killed renamable $r12, renamable $r3, 11, 14, $noreg, $noreg
+    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+    t2DoLoopStart renamable $lr
+    $r12 = t2MOVr killed $r3, 14, $noreg, $noreg
+    renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
+    renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg
+
+  bb.1.vector.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $r0, $r1, $r2, $r3, $r12
+
+    renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg
+    renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep45, align 1)
+    renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg
+    renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg
+    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep23, align 1)
+    renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+    MVE_VPST 8, implicit $vpr
+    renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14, $noreg
+
+  bb.2.for.cond.cleanup:
+    tPOP_RET 14, $noreg, def $r4, def $pc
+
+...

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir
index c052e22d217d..5f4a10249684 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir
@@ -142,6 +142,7 @@ body:             |
     t2IT 2, 8, implicit-def $itstate
     renamable $r3 = tLSRri $noreg, killed renamable $r3, 1, 2, killed $cpsr, implicit renamable $r3, implicit killed $itstate
     early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep4)
+    renamable $lr = tMOVr $lr, 14, $noreg
     t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr
     tB %bb.2, 14, $noreg
 


        


More information about the llvm-commits mailing list