[llvm] 835251f - [Target][ARM] Make Low Overhead Loops coexist with VPT blocks.

via llvm-commits llvm-commits at lists.llvm.org
Wed May 20 04:25:21 PDT 2020


Author: Pierre-vh
Date: 2020-05-20T12:24:55+01:00
New Revision: 835251f7d99a6ff8d396fa019dcddb87a958b15f

URL: https://github.com/llvm/llvm-project/commit/835251f7d99a6ff8d396fa019dcddb87a958b15f
DIFF: https://github.com/llvm/llvm-project/commit/835251f7d99a6ff8d396fa019dcddb87a958b15f.diff

LOG: [Target][ARM] Make Low Overhead Loops coexist with VPT blocks.

Previously, the LowOverheadLoops pass couldn't handle VPT blocks
with conditions, or with multiple VCTPs. This patch improves the
LowOverheadLoops pass so it can handle those cases.

It also adds support for VCMPs before the VCTP.

Differential Revision: https://reviews.llvm.org/D78206

Added: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir

Modified: 
    llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 71da119cd542..bb57fa5d0ea8 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -35,6 +35,20 @@
 /// are defined to be as large as this maximum sequence of replacement
 /// instructions.
 ///
+/// A note on VPR.P0 (the lane mask):
+/// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
+/// "VPT Active" context (which includes low-overhead loops and vpt blocks).
+/// They will simply "and" the result of their calculation with the current
+/// value of VPR.P0. You can think of it like this:
+/// \verbatim
+/// if VPT active:    ; Between a DLSTP/LETP, or for predicated instrs
+///   VPR.P0 &= Value
+/// else
+///   VPR.P0 = Value
+/// \endverbatim
+/// When we're inside the low-overhead loop (between DLSTP and LETP), we always
+/// fall in the "VPT active" case, so we can consider that all VPR writes by
+/// one of those instruction is actually a "and".
 //===----------------------------------------------------------------------===//
 
 #include "ARM.h"
@@ -123,27 +137,31 @@ namespace {
     SetVector<MachineInstr*> Predicates;
 
   public:
-    PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) :
-      MI(I) { Predicates.insert(Preds.begin(), Preds.end()); }
+    PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) {
+      assert(I && "Instruction must not be null!");
+      Predicates.insert(Preds.begin(), Preds.end());
+    }
   };
 
-  // Represent a VPT block, a list of instructions that begins with a VPST and
-  // has a maximum of four proceeding instructions. All instructions within the
-  // block are predicated upon the vpr and we allow instructions to define the
-  // vpr within in the block too.
+  // Represent a VPT block, a list of instructions that begins with a VPT/VPST
+  // and has a maximum of four proceeding instructions. All instructions within
+  // the block are predicated upon the vpr and we allow instructions to define
+  // the vpr within in the block too.
   class VPTBlock {
-    std::unique_ptr<PredicatedMI> VPST;
+    // The predicate then instruction, which is either a VPT, or a VPST
+    // instruction.
+    std::unique_ptr<PredicatedMI> PredicateThen;
     PredicatedMI *Divergent = nullptr;
     SmallVector<PredicatedMI, 4> Insts;
 
   public:
     VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
-      VPST = std::make_unique<PredicatedMI>(MI, Preds);
+      PredicateThen = std::make_unique<PredicatedMI>(MI, Preds);
     }
 
     void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
       LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
-      if (!Divergent && !set_
diff erence(Preds, VPST->Predicates).empty()) {
+      if (!Divergent && !set_
diff erence(Preds, PredicateThen->Predicates).empty()) {
         Divergent = &Insts.back();
         LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
       }
@@ -160,18 +178,26 @@ namespace {
     // Is the given instruction part of the predicate set controlling the entry
     // to the block.
     bool IsPredicatedOn(MachineInstr *MI) const {
-      return VPST->Predicates.count(MI);
+      return PredicateThen->Predicates.count(MI);
+    }
+
+    // Returns true if this is a VPT instruction.
+    bool isVPT() const { return !isVPST(); }
+
+    // Returns true if this is a VPST instruction.
+    bool isVPST() const {
+      return PredicateThen->MI->getOpcode() == ARM::MVE_VPST;
     }
 
     // Is the given instruction the only predicate which controls the entry to
     // the block.
     bool IsOnlyPredicatedOn(MachineInstr *MI) const {
-      return IsPredicatedOn(MI) && VPST->Predicates.size() == 1;
+      return IsPredicatedOn(MI) && PredicateThen->Predicates.size() == 1;
     }
 
     unsigned size() const { return Insts.size(); }
     SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
-    MachineInstr *getVPST() const { return VPST->MI; }
+    MachineInstr *getPredicateThen() const { return PredicateThen->MI; }
     PredicatedMI *getDivergent() const { return Divergent; }
   };
 
@@ -187,6 +213,7 @@ namespace {
     MachineInstr *Dec = nullptr;
     MachineInstr *End = nullptr;
     MachineInstr *VCTP = nullptr;
+    SmallPtrSet<MachineInstr*, 4> SecondaryVCTPs;
     VPTBlock *CurrentBlock = nullptr;
     SetVector<MachineInstr*> CurrentPredicate;
     SmallVector<VPTBlock, 4> VPTBlocks;
@@ -368,14 +395,23 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
   for (auto &Block : VPTBlocks) {
     if (Block.IsPredicatedOn(VCTP))
       continue;
-    if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI)) {
+    if (Block.HasNonUniformPredicate() && !isVCTP(Block.getDivergent()->MI)) {
       LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: "
-                 << *Block.getDivergent()->MI);
+                        << *Block.getDivergent()->MI);
       return false;
     }
     SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
     for (auto &PredMI : Insts) {
-      if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI))
+      // Check the instructions in the block and only allow:
+      //   - VCTPs
+      //   - Instructions predicated on the main VCTP
+      //   - Any VCMP
+      //      - VCMPs just "and" their result with VPR.P0. Whether they are
+      //      located before/after the VCTP is irrelevant - the end result will
+      //      be the same in both cases, so there's no point in requiring them
+      //      to be located after the VCTP!
+      if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI) ||
+          VCMPOpcodeToVPT(PredMI.MI->getOpcode()) != 0)
         continue;
       LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
                  << " - which is predicated on:\n";
@@ -474,6 +510,8 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
     SmallPtrSet<MachineInstr*, 2> Ignore = { VCTP };
     unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
 
+    Ignore.insert(SecondaryVCTPs.begin(), SecondaryVCTPs.end());
+
     if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) {
       bool FoundSub = false;
 
@@ -622,7 +660,7 @@ bool LowOverheadLoop::ValidateLiveOuts() const {
     if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
       continue;
 
-    if (isVCTP(&MI) || MI.getOpcode() == ARM::MVE_VPST)
+    if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
       continue;
 
     // Predicated loads will write zeros to the falsely predicated bytes of the
@@ -768,29 +806,44 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
   if (CannotTailPredicate)
     return false;
 
-  // Only support a single vctp.
-  if (isVCTP(MI) && VCTP)
-    return false;
+  if (isVCTP(MI)) {
+    // If we find another VCTP, check whether it uses the same value as the main VCTP.
+    // If it does, store it in the SecondaryVCTPs set, else refuse it.
+    if (VCTP) {
+      if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
+          !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) {
+        LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a 
diff erent reaching "
+                             "definition from the main VCTP");
+        return false;
+      }
+      LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI);
+      SecondaryVCTPs.insert(MI);
+    } else {
+      LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI);
+      VCTP = MI;
+    }
+  } else if (isVPTOpcode(MI->getOpcode())) {
+    if (MI->getOpcode() != ARM::MVE_VPST) {
+      assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 &&
+             "VPT does not implicitly define VPR?!");
+      CurrentPredicate.insert(MI);
+    }
 
-  // Start a new vpt block when we discover a vpt.
-  if (MI->getOpcode() == ARM::MVE_VPST) {
     VPTBlocks.emplace_back(MI, CurrentPredicate);
     CurrentBlock = &VPTBlocks.back();
     return true;
-  } else if (isVCTP(MI))
-    VCTP = MI;
-  else if (MI->getOpcode() == ARM::MVE_VPSEL ||
-           MI->getOpcode() == ARM::MVE_VPNOT)
+  } else if (MI->getOpcode() == ARM::MVE_VPSEL ||
+             MI->getOpcode() == ARM::MVE_VPNOT) {
+    // TODO: Allow VPSEL and VPNOT, we currently cannot because:
+    // 1) It will use the VPR as a predicate operand, but doesn't have to be
+    //    instead a VPT block, which means we can assert while building up
+    //    the VPT block because we don't find another VPT or VPST to being a new
+    //    one.
+    // 2) VPSEL still requires a VPR operand even after tail predicating,
+    //    which means we can't remove it unless there is another
+    //    instruction, such as vcmp, that can provide the VPR def.
     return false;
-
-  // TODO: Allow VPSEL and VPNOT, we currently cannot because:
-  // 1) It will use the VPR as a predicate operand, but doesn't have to be
-  //    instead a VPT block, which means we can assert while building up
-  //    the VPT block because we don't find another VPST to being a new
-  //    one.
-  // 2) VPSEL still requires a VPR operand even after tail predicating,
-  //    which means we can't remove it unless there is another
-  //    instruction, such as vcmp, that can provide the VPR def.
+  }
 
   bool IsUse = false;
   bool IsDef = false;
@@ -1170,26 +1223,24 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
   };
 
   // There are a few scenarios which we have to fix up:
-  // 1) A VPT block with is only predicated by the vctp and has no internal vpr
-  //    defs.
-  // 2) A VPT block which is only predicated by the vctp but has an internal
-  //    vpr def.
-  // 3) A VPT block which is predicated upon the vctp as well as another vpr
-  //    def.
-  // 4) A VPT block which is not predicated upon a vctp, but contains it and
-  //    all instructions within the block are predicated upon in.
-
+  // 1. VPT Blocks with non-uniform predicates:
+  //    - a. When the divergent instruction is a vctp
+  //    - b. When the block uses a vpst, and is only predicated on the vctp
+  //    - c. When the block uses a vpt and (optionally) contains one or more
+  //         vctp.
+  // 2. VPT Blocks with uniform predicates:
+  //    - a. The block uses a vpst, and is only predicated on the vctp
   for (auto &Block : LoLoop.getVPTBlocks()) {
     SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
     if (Block.HasNonUniformPredicate()) {
       PredicatedMI *Divergent = Block.getDivergent();
       if (isVCTP(Divergent->MI)) {
-        // The vctp will be removed, so the block mask of the VPST/VPT will need
+        // The vctp will be removed, so the block mask of the vp(s)t will need
         // to be recomputed.
-        LoLoop.BlockMasksToRecompute.insert(Block.getVPST());
-      } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
-        // The VPT block has a non-uniform predicate but it's entry is guarded
-        // only by a vctp, which means we:
+        LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
+      } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+        // The VPT block has a non-uniform predicate but it uses a vpst and its
+        // entry is guarded only by a vctp, which means we:
         // - Need to remove the original vpst.
         // - Then need to unpredicate any following instructions, until
         //   we come across the divergent vpr def.
@@ -1197,7 +1248,7 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
         //   the divergent vpr def.
         // TODO: We could be producing more VPT blocks than necessary and could
         // fold the newly created one into a proceeding one.
-        for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()),
+        for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()),
              E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
           RemovePredicate(&*I);
 
@@ -1210,29 +1261,58 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
           ++Size;
           ++I;
         }
-        // Create a VPST with a null mask, we'll recompute it later.
+        // Create a VPST (with a null mask for now, we'll recompute it later).
         MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
                                           InsertAt->getDebugLoc(),
                                           TII->get(ARM::MVE_VPST));
         MIB.addImm(0);
-        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
         LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
-        LoLoop.ToRemove.insert(Block.getVPST());
+        LoLoop.ToRemove.insert(Block.getPredicateThen());
         LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
       }
-    } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
-      // A vpt block which is only predicated upon vctp and has no internal vpr
-      // defs:
+      // Else, if the block uses a vpt, iterate over the block, removing the
+      // extra VCTPs it may contain.
+      else if (Block.isVPT()) {
+        bool RemovedVCTP = false;
+        for (PredicatedMI &Elt : Block.getInsts()) {
+          MachineInstr *MI = Elt.MI;
+          if (isVCTP(MI)) {
+            LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *MI);
+            LoLoop.ToRemove.insert(MI);
+            RemovedVCTP = true;
+            continue;
+          }
+        }
+        if (RemovedVCTP)
+          LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
+      }
+    } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP) && Block.isVPST()) {
+      // A vpt block starting with VPST, is only predicated upon vctp and has no
+      // internal vpr defs:
       // - Remove vpst.
       // - Unpredicate the remaining instructions.
-      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
-      LoLoop.ToRemove.insert(Block.getVPST());
+      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
+      LoLoop.ToRemove.insert(Block.getPredicateThen());
       for (auto &PredMI : Insts)
         RemovePredicate(PredMI.MI);
     }
   }
-  LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
+  LLVM_DEBUG(dbgs() << "ARM Loops: Removing remaining VCTPs...\n");
+  // Remove the "main" VCTP
   LoLoop.ToRemove.insert(LoLoop.VCTP);
+  LLVM_DEBUG(dbgs() << "    " << *LoLoop.VCTP);
+  // Remove remaining secondary VCTPs
+  for (MachineInstr *VCTP : LoLoop.SecondaryVCTPs) {
+    // All VCTPs that aren't marked for removal yet should be unpredicated ones.
+    // The predicated ones should have already been marked for removal when
+    // visiting the VPT blocks.
+    if (LoLoop.ToRemove.insert(VCTP).second) {
+      assert(getVPTInstrPredicate(*VCTP) == ARMVCC::None &&
+             "Removing Predicated VCTP without updating the block mask!");
+      LLVM_DEBUG(dbgs() << "    " << *VCTP);
+    }
+  }
 }
 
 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
index 6b9675dfc091..9af4185cae44 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -449,26 +449,17 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
-; CHECK-NEXT:    add.w r12, r3, #3
-; CHECK-NEXT:    mov.w lr, #1
-; CHECK-NEXT:    bic r12, r12, #3
-; CHECK-NEXT:    sub.w r12, r12, #4
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB5_1: @ %bb12
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrwt.u32 q0, [r0]
-; CHECK-NEXT:    vpttt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vcmpt.s32 le, q0, r2
-; CHECK-NEXT:    vctpt.32 r3
 ; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
-; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [r0], #16
-; CHECK-NEXT:    le lr, .LBB5_1
+; CHECK-NEXT:    letp lr, .LBB5_1
 ; CHECK-NEXT:  @ %bb.2: @ %bb32
 ; CHECK-NEXT:    pop {r7, pc}
 bb:

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
index 6df9702ca01d..2f1641516a0d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
@@ -118,24 +118,16 @@ body:             |
   ; CHECK: bb.1.bb3:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $r12 = t2ADDri renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   $vpr = VMSR_P0 killed $r3, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0)
   ; CHECK:   $r3 = tMOVr $r0, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r2
   ; CHECK: bb.2.bb9:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3
+  ; CHECK:   liveins: $lr, $r0, $r1, $r3
   ; CHECK:   renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0)
-  ; CHECK:   MVE_VPST 4, implicit $vpr
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
-  ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
-  ; CHECK:   renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   MVE_VPST 8, implicit $vpr
+  ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
   ; CHECK:   MVE_VPST 4, implicit $vpr
   ; CHECK:   renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr
   ; CHECK:   renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
@@ -143,7 +135,7 @@ body:             |
   ; CHECK:   MVE_VPST 8, implicit $vpr
   ; CHECK:   MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
   ; CHECK:   $r0 = tMOVr $r3, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
   ; CHECK: bb.3.bb27:
   ; CHECK:   $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg
   ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
new file mode 100644
index 000000000000..f3c1cc639b0f
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
@@ -0,0 +1,937 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s
+
+--- |
+  @_ZL3arr = internal global [10 x i32] [i32 1, i32 2, i32 3, i32 5, i32 5, i32 5, i32 -2, i32 0, i32 -8, i32 -1], align 4
+  @.str = private unnamed_addr constant [5 x i8] c"%d, \00", align 1
+
+  define arm_aapcs_vfpcc void @vpt_block(i32* nocapture %A, i32 %n, i32 %x) {
+  entry:
+    %cmp9 = icmp sgt i32 %n, 0
+    %0 = add i32 %n, 3
+    %1 = lshr i32 %0, 2
+    %2 = shl nuw i32 %1, 2
+    %3 = add i32 %2, -4
+    %4 = lshr i32 %3, 2
+    %5 = add nuw nsw i32 %4, 1
+    br i1 %cmp9, label %vector.ph, label %for.cond.cleanup
+
+  vector.ph:                                        ; preds = %entry
+    %sub = sub nsw i32 0, %x
+    call void @llvm.set.loop.iterations.i32(i32 %5)
+    br label %vector.body
+
+  vector.body:                                      ; preds = %vector.body, %vector.ph
+    %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %A, %vector.ph ]
+    %6 = phi i32 [ %5, %vector.ph ], [ %18, %vector.body ]
+    %7 = phi i32 [ %n, %vector.ph ], [ %9, %vector.body ]
+    %lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>*
+    %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
+    %9 = sub i32 %7, 4
+    %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv12, i32 4, <4 x i1> %8, <4 x i32> undef)
+    %10 = insertelement <4 x i32> undef, i32 %x, i32 0
+    %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer
+    %12 = icmp slt <4 x i32> %wide.masked.load, %11
+    %13 = insertelement <4 x i32> undef, i32 %sub, i32 0
+    %14 = shufflevector <4 x i32> %13, <4 x i32> undef, <4 x i32> zeroinitializer
+    %15 = icmp sgt <4 x i32> %wide.masked.load, %14
+    %16 = and <4 x i1> %12, %15
+    %17 = and <4 x i1> %16, %8
+    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> zeroinitializer, <4 x i32>* %lsr.iv12, i32 4, <4 x i1> %17)
+    %scevgep = getelementptr i32, i32* %lsr.iv1, i32 4
+    %18 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+    %19 = icmp ne i32 %18, 0
+    br i1 %19, label %vector.body, label %for.cond.cleanup
+
+  for.cond.cleanup:                                 ; preds = %vector.body, %entry
+    ret void
+  }
+
+  define arm_aapcs_vfpcc void @
diff erent_vcpt_reaching_def(i32* nocapture %A, i32 %n, i32 %x) {
+    ; Intentionally left blank - see MIR sequence below.
+    entry:
+      unreachable
+    vector.body:
+      unreachable
+    for.cond.cleanup:
+      unreachable
+  }
+
+  define arm_aapcs_vfpcc void @
diff erent_vcpt_operand(i32* nocapture %A, i32 %n, i32 %x) {
+    ; Intentionally left blank - see MIR sequence below.
+    entry:
+      unreachable
+    vector.body:
+      unreachable
+    for.cond.cleanup:
+      unreachable
+  }
+
+  define arm_aapcs_vfpcc void @else_vcpt(i32* nocapture %data, i32 %N, i32 %T) {
+  entry:
+    %cmp9 = icmp sgt i32 %N, 0
+    %0 = add i32 %N, 3
+    %1 = lshr i32 %0, 2
+    %2 = shl nuw i32 %1, 2
+    %3 = add i32 %2, -4
+    %4 = lshr i32 %3, 2
+    %5 = add nuw nsw i32 %4, 1
+    br i1 %cmp9, label %vector.ph, label %for.cond.cleanup
+
+  vector.ph:                                        ; preds = %entry
+    %sub = sub nsw i32 0, %T
+    call void @llvm.set.loop.iterations.i32(i32 %5)
+    br label %vector.body
+
+  vector.body:                                      ; preds = %vector.body, %vector.ph
+    %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %data, %vector.ph ]
+    %6 = phi i32 [ %5, %vector.ph ], [ %18, %vector.body ]
+    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
+    %lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>*
+    %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
+    %9 = sub i32 %7, 4
+    %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv12, i32 4, <4 x i1> %8, <4 x i32> undef)
+    %10 = insertelement <4 x i32> undef, i32 %T, i32 0
+    %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer
+    %12 = icmp slt <4 x i32> %wide.masked.load, %11
+    %13 = insertelement <4 x i32> undef, i32 %sub, i32 0
+    %14 = shufflevector <4 x i32> %13, <4 x i32> undef, <4 x i32> zeroinitializer
+    %15 = icmp sgt <4 x i32> %wide.masked.load, %14
+    %16 = or <4 x i1> %12, %15
+    %17 = and <4 x i1> %16, %8
+    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> zeroinitializer, <4 x i32>* %lsr.iv12, i32 4, <4 x i1> %17)
+    %scevgep = getelementptr i32, i32* %lsr.iv1, i32 4
+    %18 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+    %19 = icmp ne i32 %18, 0
+    br i1 %19, label %vector.body, label %for.cond.cleanup
+
+  for.cond.cleanup:                                 ; preds = %vector.body, %entry
+    ret void
+  }
+
+  define arm_aapcs_vfpcc void @loop_invariant_vpt_operands(i32* nocapture %A, i32 %n, i32 %x) {
+    ; Intentionally left blank - see MIR sequence below.
+    entry:
+      unreachable
+    vector.body:
+      unreachable
+    for.cond.cleanup:
+      unreachable
+  }
+
+  define arm_aapcs_vfpcc void @vctp_before_vpt(i32* nocapture %A, i32 %n, i32 %x) {
+    ; Intentionally left blank - see MIR sequence below.
+    entry:
+      unreachable
+    vector.body:
+      unreachable
+    for.cond.cleanup:
+      unreachable
+  }
+
+  define arm_aapcs_vfpcc void @vpt_load_vctp_store(i32* nocapture %A, i32 %n, i32 %x) {
+    ; Intentionally left blank - see MIR sequence below.
+    entry:
+      unreachable
+    vector.body:
+      unreachable
+    for.cond.cleanup:
+      unreachable
+  }
+
+  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+...
+---
+name:            vpt_block
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: vpt_block
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r7
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 11, 8, implicit-def $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
+  ; CHECK: bb.1.vector.body:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r2, $r3
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg
+  ; CHECK:   MVE_VPTv4s32r 4, renamable $q1, renamable $r2, 11, implicit-def $vpr
+  ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
+  ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.1
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r7, $lr
+
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2IT 11, 8, implicit-def $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+    renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
+    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+    renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+    t2DoLoopStart renamable $lr
+
+  bb.1.vector.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2, $r3
+
+    renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
+    MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr
+    renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
+    renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr
+    renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
+    renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2.for.cond.cleanup:
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+...
+---
+name:            
diff erent_vcpt_reaching_def
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: 
diff erent_vcpt_reaching_def
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r7
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 11, 8, implicit-def $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK: bb.1.vector.body:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+  ; CHECK:   MVE_VPST 8, implicit $vpr
+  ; CHECK:   renamable $r1 = MVE_VSTRWU32_post renamable $q0, killed renamable $r1, 16, 1, renamable $vpr
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
+  ; CHECK:   MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr
+  ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr
+  ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
+  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.1
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+  ;
+  ; Tests that secondary VCTPs are refused when their operand's reaching definition is not the same as the main
+  ; VCTP's.
+  ;
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r7, $lr
+
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2IT 11, 8, implicit-def $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+    renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
+    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+    renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+    t2DoLoopStart renamable $lr
+
+  bb.1.vector.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2, $r3
+
+    renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $r1 = MVE_VSTRWU32_post renamable $q0, killed renamable $r1, 16, 1, renamable $vpr
+    renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
+    MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr
+    renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
+    renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr
+    renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
+    renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2.for.cond.cleanup:
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+...
+---
+name:            
diff erent_vcpt_operand
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: 
diff erent_vcpt_operand
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r7
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 11, 8, implicit-def $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK: bb.1.vector.body:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+  ; CHECK:   MVE_VPST 8, implicit $vpr
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
+  ; CHECK:   MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr
+  ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
+  ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
+  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.1
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+  ;
+  ; Tests that secondary VCTPs are refused when their operand is not the same register as the main VCTP's.
+  ;
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r7, $lr
+
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2IT 11, 8, implicit-def $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+    renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
+    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+    renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+    t2DoLoopStart renamable $lr
+
+  bb.1.vector.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2, $r3
+
+    renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
+    MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr
+    renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
+    renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
+    renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
+    renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2.for.cond.cleanup:
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+...
+---
+name:            else_vcpt
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: else_vcpt
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r7
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 11, 8, implicit-def $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
+  ; CHECK: bb.1.vector.body:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r2, $r3
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg
+  ; CHECK:   MVE_VPTv4s32r 12, renamable $q1, renamable $r2, 10, implicit-def $vpr
+  ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 13, 1, killed renamable $vpr
+  ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 2, killed renamable $vpr
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.1
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+  ;
+  ; Test including a else-predicated VCTP.
+  ;
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r7, $lr
+
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2IT 11, 8, implicit-def $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+    renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
+    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+    renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+    t2DoLoopStart renamable $lr
+
+  bb.1.vector.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2, $r3
+
+    renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
+    MVE_VPTv4s32r 14, renamable $q1, renamable $r2, 10, implicit-def $vpr
+    renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 13, 1, killed renamable $vpr
+    renamable $vpr = MVE_VCTP32 renamable $r1, 2, killed renamable $vpr
+    renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 2, killed renamable $vpr
+    renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2.for.cond.cleanup:
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+...
+---
+name:            loop_invariant_vpt_operands
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: loop_invariant_vpt_operands
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r7
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 11, 8, implicit-def $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
+  ; CHECK: bb.1.vector.body:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r2, $r3
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg
+  ; CHECK:   MVE_VPTv4s32r 4, renamable $q0, renamable $r2, 11, implicit-def $vpr
+  ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
+  ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.1
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r7, $lr
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2IT 11, 8, implicit-def $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+    renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
+    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+    renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+    t2DoLoopStart renamable $lr
+
+  bb.1.vector.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2, $r3
+
+    renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
+    MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 11, implicit-def $vpr
+    renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
+    renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr
+    renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
+    renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2.for.cond.cleanup:
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+...
+---
+name:            vctp_before_vpt
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: vctp_before_vpt
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $lr, $r1, $r2, $r7
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 11, 8, implicit-def $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
+  ; CHECK: bb.1.vector.body:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r2, $r3
+  ; CHECK:   MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 8, implicit-def $vpr
+  ; CHECK:   dead renamable $vpr = MVE_VCMPs32r renamable $q0, renamable $r3, 12, 1, killed renamable $vpr
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.1
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r7, $lr
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2IT 11, 8, implicit-def $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+    renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
+    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+    renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+    t2DoLoopStart renamable $lr
+
+  bb.1.vector.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2, $r3
+
+    MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 8, implicit-def $vpr
+    renamable $vpr = MVE_VCMPs32r killed renamable $q0, renamable $r3, 12, 1, killed renamable $vpr
+    renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+    renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2.for.cond.cleanup:
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+...
+---
+name:            vpt_load_vctp_store
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: vpt_load_vctp_store
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r7
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 11, 8, implicit-def $itstate
+  ; CHECK:   frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   dead renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK: bb.1.vector.body:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2
+  ; CHECK:   MVE_VPTv4s32r 2, killed renamable $q0, renamable $r2, 2, implicit-def $vpr
+  ; CHECK:   renamable $q0 = MVE_VLDRWU32 renamable $r0, 0, 1, $vpr
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed $vpr
+  ; CHECK:   MVE_VSTRWU32 renamable $q0, renamable $r0, 0, 1, killed $vpr
+  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.1
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+  ;
+  ; This shouldn't be tail-predicated because the VLDR isn't predicated on the VCTP.
+  ;
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r7, $lr
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2IT 11, 8, implicit-def $itstate
+    frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+    renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
+    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+    renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
+    t2DoLoopStart renamable $lr
+
+  bb.1.vector.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2, $r3
+
+    MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 2, implicit-def $vpr
+    renamable $q0 = MVE_VLDRWU32 renamable $r0, 0, 1, $vpr
+    renamable $vpr = MVE_VCTP32 renamable $r1, 1, $vpr
+    MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, $vpr
+    renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2.for.cond.cleanup:
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
+...


        


More information about the llvm-commits mailing list