[llvm] 4042518 - [ARM][MVE] Tail predicate in the presence of vcmp

Sam Parker via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 20 00:43:07 PST 2019


Author: Sam Parker
Date: 2019-12-20T08:42:11Z
New Revision: 404251833521770732646c4348f774b94b40df72

URL: https://github.com/llvm/llvm-project/commit/404251833521770732646c4348f774b94b40df72
DIFF: https://github.com/llvm/llvm-project/commit/404251833521770732646c4348f774b94b40df72.diff

LOG: [ARM][MVE] Tail predicate in the presence of vcmp

Record the discovered VPT blocks while checking for validity and, for
now, only handle blocks that begin with VPST and not VPT. We're now
allowing more than one instruction to define vpr, but each block must
somehow be predicated using the vctp. This leaves us with several
scenarios which need fixing up:
1) A VPT block which is only predicated by the vctp and has no
   internal vpr defs.
2) A VPT block which is only predicated by the vctp but has an
   internal vpr def.
3) A VPT block which is predicated upon the vctp as well as another
   vpr def.
4) A VPT block which is not predicated upon a vctp, but contains it
   and all instructions within the block are predicated upon it.

The changes needed are, for:
1) The easy one, just remove the vpst and unpredicate the
   instructions in the block.
2) Remove the vpst and unpredicate the instructions up to the
   internal vpr def. Need to insert a new vpst to predicate the
   remaining instructions.
3) Do nothing.
4) The vctp will be inside a vpt and the instruction will be removed,
   so adjust the size of the mask on the vpst.

Differential Revision: https://reviews.llvm.org/D71107

Added: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir

Modified: 
    llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
    llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
    llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index e3a05408343e..ec62a6975f00 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -22,6 +22,7 @@
 #include "ARMBaseRegisterInfo.h"
 #include "ARMBasicBlockInfo.h"
 #include "ARMSubtarget.h"
+#include "llvm/ADT/SetOperations.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineLoopUtils.h"
@@ -37,6 +38,65 @@ using namespace llvm;
 
 namespace {
 
+  struct PredicatedMI {
+    MachineInstr *MI = nullptr;
+    SetVector<MachineInstr*> Predicates;
+
+  public:
+    PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) :
+    MI(I) {
+      Predicates.insert(Preds.begin(), Preds.end());
+    }
+  };
+
+  // Represent a VPT block, a list of instructions that begins with a VPST and
+  // has a maximum of four proceeding instructions. All instructions within the
+  // block are predicated upon the vpr and we allow instructions to define the
+  // vpr within in the block too.
+  class VPTBlock {
+    std::unique_ptr<PredicatedMI> VPST;
+    PredicatedMI *Divergent = nullptr;
+    SmallVector<PredicatedMI, 4> Insts;
+
+  public:
+    VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
+      VPST = std::make_unique<PredicatedMI>(MI, Preds);
+    }
+
+    void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
+      LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
+      if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) {
+        Divergent = &Insts.back();
+        LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
+      }
+      Insts.emplace_back(MI, Preds);
+      assert(Insts.size() <= 4 && "Too many instructions in VPT block!");
+    }
+
+    // Have we found an instruction within the block which defines the vpr? If
+    // so, not all the instructions in the block will have the same predicate.
+    bool HasNonUniformPredicate() const {
+      return Divergent != nullptr;
+    }
+
+    // Is the given instruction part of the predicate set controlling the entry
+    // to the block.
+    bool IsPredicatedOn(MachineInstr *MI) const {
+      return VPST->Predicates.count(MI);
+    }
+
+    // Is the given instruction the only predicate which controls the entry to
+    // the block.
+    bool IsOnlyPredicatedOn(MachineInstr *MI) const {
+      return IsPredicatedOn(MI) && VPST->Predicates.size() == 1;
+    }
+
+    unsigned size() const { return Insts.size(); }
+    SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
+    MachineInstr *getVPST() const { return VPST->MI; }
+    PredicatedMI *getDivergent() const { return Divergent; }
+  };
+
   struct LowOverheadLoop {
 
     MachineLoop *ML = nullptr;
@@ -46,39 +106,17 @@ namespace {
     MachineInstr *Dec = nullptr;
     MachineInstr *End = nullptr;
     MachineInstr *VCTP = nullptr;
-    SmallVector<MachineInstr*, 4> VPTUsers;
+    VPTBlock *CurrentBlock = nullptr;
+    SetVector<MachineInstr*> CurrentPredicate;
+    SmallVector<VPTBlock, 4> VPTBlocks;
     bool Revert = false;
-    bool FoundOneVCTP = false;
     bool CannotTailPredicate = false;
 
     LowOverheadLoop(MachineLoop *ML) : ML(ML) {
       MF = ML->getHeader()->getParent();
     }
 
-    // For now, only support one vctp instruction. If we find multiple then
-    // we shouldn't perform tail predication.
-    void addVCTP(MachineInstr *MI) {
-      if (!VCTP) {
-        VCTP = MI;
-        FoundOneVCTP = true;
-      } else
-        FoundOneVCTP = false;
-    }
-
-    // Check that nothing else is writing to VPR and record any insts
-    // reading the VPR.
-    void ScanForVPR(MachineInstr *MI) {
-      for (auto &MO : MI->operands()) {
-        if (!MO.isReg() || MO.getReg() != ARM::VPR)
-          continue;
-        if (MO.isUse())
-          VPTUsers.push_back(MI);
-        if (MO.isDef()) {
-          CannotTailPredicate = true;
-          break;
-        }
-      }
-    }
+    bool RecordVPTBlocks(MachineInstr *MI);
 
     // If this is an MVE instruction, check that we know how to use tail
     // predication with it.
@@ -86,6 +124,11 @@ namespace {
       if (CannotTailPredicate)
         return;
 
+      if (!RecordVPTBlocks(MI)) {
+        CannotTailPredicate = true;
+        return;
+      }
+
       const MCInstrDesc &MCID = MI->getDesc();
       uint64_t Flags = MCID.TSFlags;
       if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
@@ -100,7 +143,7 @@ namespace {
     bool IsTailPredicationLegal() const {
       // For now, let's keep things really simple and only support a single
       // block for tail predication.
-      return !Revert && FoundAllComponents() && FoundOneVCTP &&
+      return !Revert && FoundAllComponents() && VCTP &&
              !CannotTailPredicate && ML->getNumBlocks() == 1;
     }
 
@@ -118,6 +161,8 @@ namespace {
       return Start && Dec && End;
     }
 
+    SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; }
+
     // Return the loop iteration count, or the number of elements if we're tail
     // predicating.
     MachineOperand &getCount() {
@@ -191,7 +236,7 @@ namespace {
 
     void RemoveLoopUpdate(LowOverheadLoop &LoLoop);
 
-    void RemoveVPTBlocks(LowOverheadLoop &LoLoop);
+    void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
 
     MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
 
@@ -281,13 +326,39 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
   } else
     LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
 
+  if (!IsTailPredicationLegal()) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Tail-predication is not valid.\n");
+    return;
+  }
+
+  // All predication within the loop should be based on vctp. If the block
+  // isn't predicated on entry, check whether the vctp is within the block
+  // and that all other instructions are then predicated on it.
+  for (auto &Block : VPTBlocks) {
+    if (Block.IsPredicatedOn(VCTP))
+      continue;
+    if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI)) {
+      CannotTailPredicate = true;
+      return;
+    }
+    SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
+    for (auto &PredMI : Insts) {
+      if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI))
+        continue;
+      LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
+                        << " - which is predicated on:\n";
+                        for (auto *MI : PredMI.Predicates)
+                          dbgs() << "   - " << *MI;
+                 );
+      CannotTailPredicate = true;
+      return;
+    }
+  }
+
   // For tail predication, we need to provide the number of elements, instead
   // of the iteration count, to the loop start instruction. The number of
   // elements is provided to the vctp instruction, so we need to check that
   // we can use this register at InsertPt.
-  if (!IsTailPredicationLegal())
-    return;
-
   Register NumElements = VCTP->getOperand(1).getReg();
 
   // If the register is defined within loop, then we can't perform TP.
@@ -338,9 +409,65 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
     MBB = *MBB->pred_begin();
   }
 
-  LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication to convert:\n";
-               for (auto *MI : VPTUsers)
-                 dbgs() << " - " << *MI;);
+  LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n");
+}
+
+bool LowOverheadLoop::RecordVPTBlocks(MachineInstr* MI) {
+  // Only support a single vctp.
+  if (isVCTP(MI) && VCTP)
+    return false;
+
+  // Start a new vpt block when we discover a vpt.
+  if (MI->getOpcode() == ARM::MVE_VPST) {
+    VPTBlocks.emplace_back(MI, CurrentPredicate);
+    CurrentBlock = &VPTBlocks.back();
+    return true;
+  }
+
+  if (isVCTP(MI))
+    VCTP = MI;
+
+  unsigned VPROpNum = MI->getNumOperands() - 1;
+  bool IsUse = false;
+  if (MI->getOperand(VPROpNum).isReg() &&
+      MI->getOperand(VPROpNum).getReg() == ARM::VPR &&
+      MI->getOperand(VPROpNum).isUse()) {
+    // If this instruction is predicated by VPR, it will be its last
+    // operand.  Also check that it's only 'Then' predicated.
+    if (!MI->getOperand(VPROpNum-1).isImm() ||
+        MI->getOperand(VPROpNum-1).getImm() != ARMVCC::Then) {
+      LLVM_DEBUG(dbgs() << "ARM Loops: Found unhandled predicate on: "
+                 << *MI);
+      return false;
+    }
+    CurrentBlock->addInst(MI, CurrentPredicate);
+    IsUse = true;
+  }
+
+  bool IsDef = false;
+  for (unsigned i = 0; i < MI->getNumOperands() - 1; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.getReg() != ARM::VPR)
+      continue;
+
+    if (MO.isDef()) {
+      CurrentPredicate.insert(MI);
+      IsDef = true;
+    } else {
+      LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
+      return false;
+    }
+  }
+
+  // If we find a vpr def that is not already predicated on the vctp, we've
+  // got disjoint predicates that may not be equivalent when we do the
+  // conversion.
+  if (IsDef && !IsUse && VCTP && !isVCTP(MI)) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI);
+    return false;
+  }
+
+  return true;
 }
 
 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
@@ -422,8 +549,6 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
         LoLoop.End = &MI;
       else if (isLoopStart(MI))
         LoLoop.Start = &MI;
-      else if (isVCTP(&MI))
-        LoLoop.addVCTP(&MI);
       else if (MI.getDesc().isCall()) {
         // TODO: Though the call will require LE to execute again, does this
         // mean we should revert? Always executing LE hopefully should be
@@ -431,10 +556,7 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
         LoLoop.Revert = true;
         LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
       } else {
-        // Once we've found a vctp, record the users of vpr and check there's
-        // no more vpr defs.
-        if (LoLoop.FoundOneVCTP)
-          LoLoop.ScanForVPR(&MI);
+        // Record VPR defs and build up their corresponding vpt blocks.
         // Check we know how to tail predicate any mve instructions.
         LoLoop.CheckTPValidity(&MI);
       }
@@ -669,27 +791,81 @@ void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) {
   }
 }
 
-void ARMLowOverheadLoops::RemoveVPTBlocks(LowOverheadLoop &LoLoop) {
-  LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
-  LoLoop.VCTP->eraseFromParent();
+void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
+  auto RemovePredicate = [](MachineInstr *MI) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
+    unsigned OpNum = MI->getNumOperands() - 1;
+    assert(MI->getOperand(OpNum-1).getImm() == ARMVCC::Then &&
+           "Expected Then predicate!");
+    MI->getOperand(OpNum-1).setImm(ARMVCC::None);
+    MI->getOperand(OpNum).setReg(0);
+  };
 
-  for (auto *MI : LoLoop.VPTUsers) {
-    if (MI->getOpcode() == ARM::MVE_VPST) {
-      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *MI);
-      MI->eraseFromParent();
-    } else {
-      unsigned OpNum = MI->getNumOperands() - 1;
-      assert((MI->getOperand(OpNum).isReg() &&
-              MI->getOperand(OpNum).getReg() == ARM::VPR) &&
-             "Expected VPR");
-      assert((MI->getOperand(OpNum-1).isImm() &&
-              MI->getOperand(OpNum-1).getImm() == ARMVCC::Then) &&
-             "Expected Then predicate");
-      MI->getOperand(OpNum-1).setImm(ARMVCC::None);
-      MI->getOperand(OpNum).setReg(0);
-      LLVM_DEBUG(dbgs() << "ARM Loops: Removed predicate from: " << *MI);
+  // There are a few scenarios which we have to fix up:
+  // 1) A VPT block with is only predicated by the vctp and has no internal vpr
+  //    defs.
+  // 2) A VPT block which is only predicated by the vctp but has an internal
+  //    vpr def.
+  // 3) A VPT block which is predicated upon the vctp as well as another vpr
+  //    def.
+  // 4) A VPT block which is not predicated upon a vctp, but contains it and
+  //    all instructions within the block are predicated upon in.
+
+  for (auto &Block : LoLoop.getVPTBlocks()) {
+    SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
+    if (Block.HasNonUniformPredicate()) {
+      PredicatedMI *Divergent = Block.getDivergent();
+      if (isVCTP(Divergent->MI)) {
+        // The vctp will be removed, so the size of the vpt block needs to be
+        // modified.
+        uint64_t Size = getARMVPTBlockMask(Block.size() - 1);
+        Block.getVPST()->getOperand(0).setImm(Size);
+        LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n");
+      } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+        // The VPT block has a non-uniform predicate but it's entry is guarded
+        // only by a vctp, which means we:
+        // - Need to remove the original vpst.
+        // - Then need to unpredicate any following instructions, until
+        //   we come across the divergent vpr def.
+        // - Insert a new vpst to predicate the instruction(s) that following
+        //   the divergent vpr def.
+        // TODO: We could be producing more VPT blocks than necessary and could
+        // fold the newly created one into a proceeding one.
+        for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()),
+             E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
+          RemovePredicate(&*I);
+
+        unsigned Size = 0;
+        auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
+        auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
+        MachineInstr *InsertAt = nullptr;
+        while (I != E) {
+          InsertAt = &*I;
+          ++Size;
+          ++I;
+        }
+        MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
+                                          InsertAt->getDebugLoc(),
+                                          TII->get(ARM::MVE_VPST));
+        MIB.addImm(getARMVPTBlockMask(Size));
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
+        LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+        Block.getVPST()->eraseFromParent();
+      }
+    } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+      // A vpt block which is only predicated upon vctp and has no internal vpr
+      // defs:
+      // - Remove vpst.
+      // - Unpredicate the remaining instructions.
+      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
+      Block.getVPST()->eraseFromParent();
+      for (auto &PredMI : Insts)
+        RemovePredicate(PredMI.MI);
     }
   }
+
+  LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
+  LoLoop.VCTP->eraseFromParent();
 }
 
 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
@@ -743,7 +919,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
     RemoveDeadBranch(LoLoop.End);
     if (LoLoop.IsTailPredicationLegal()) {
       RemoveLoopUpdate(LoLoop);
-      RemoveVPTBlocks(LoLoop);
+      ConvertVPTBlocks(LoLoop);
     }
   }
 }

diff  --git a/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp b/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
index 39d90d0b6dbb..c8b725f339e2 100644
--- a/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
+++ b/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
@@ -137,23 +137,7 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
       ++MBIter;
     };
 
-    unsigned BlockMask = 0;
-    switch (VPTInstCnt) {
-    case 1:
-      BlockMask = VPTMaskValue::T;
-      break;
-    case 2:
-      BlockMask = VPTMaskValue::TT;
-      break;
-    case 3:
-      BlockMask = VPTMaskValue::TTT;
-      break;
-    case 4:
-      BlockMask = VPTMaskValue::TTTT;
-      break;
-    default:
-      llvm_unreachable("Unexpected number of instruction in a VPT block");
-    };
+    unsigned BlockMask = getARMVPTBlockMask(VPTInstCnt);
 
     // Search back for a VCMP that can be folded to create a VPT, or else create
     // a VPST directly

diff  --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
index 11cb1a162e2b..27605422983d 100644
--- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
+++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
@@ -91,6 +91,40 @@ namespace ARMVCC {
     Then,
     Else
   };
+
+  enum VPTMaskValue {
+    T     =  8, // 0b1000
+    TT    =  4, // 0b0100
+    TE    = 12, // 0b1100
+    TTT   =  2, // 0b0010
+    TTE   =  6, // 0b0110
+    TEE   = 10, // 0b1010
+    TET   = 14, // 0b1110
+    TTTT  =  1, // 0b0001
+    TTTE  =  3, // 0b0011
+    TTEE  =  5, // 0b0101
+    TTET  =  7, // 0b0111
+    TEEE  =  9, // 0b1001
+    TEET  = 11, // 0b1011
+    TETT  = 13, // 0b1101
+    TETE  = 15  // 0b1111
+  };
+}
+
+inline static unsigned getARMVPTBlockMask(unsigned NumInsts) {
+  switch (NumInsts) {
+  case 1:
+    return ARMVCC::T;
+  case 2:
+    return ARMVCC::TT;
+  case 3:
+    return ARMVCC::TTT;
+  case 4:
+    return ARMVCC::TTTT;
+  default:
+    break;
+  };
+  llvm_unreachable("Unexpected number of instruction in a VPT block");
 }
 
 inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) {

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
index c2bc193530f6..51c625c03e0c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -1,25 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
 
-; CHECK-LABEL: vpsel_mul_reduce_add
-; CHECK:      dls lr, lr
-; CHECK:  [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
-; CHECK:      mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
-; CHECK:      vstr p0, [sp
-; CHECK:      vpstt	
-; CHECK-NEXT: vldrwt.u32
-; CHECK-NEXT: vldrwt.u32
-; CHECK:      vcmp.i32
-; CHECK:      vpsel
-; CHECK:      vldr p0, [sp
-; CHECK:      vpst	
-; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
-; CHECK:      sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
-; CHECK:      le lr, [[LOOP]]
-; CHECK:      vctp.32	[[ELEMS_OUT]]
-; CHECK-NEXT: vpsel
-; CHECK-NEXT: vaddv.u32
 define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
+; CHECK-LABEL: vpsel_mul_reduce_add:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    itt eq
+; CHECK-NEXT:    moveq r0, #0
+; CHECK-NEXT:    bxeq lr
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    adds r4, r3, #3
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    bic r4, r4, #3
+; CHECK-NEXT:    sub.w r12, r4, #4
+; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    add.w lr, r4, r12, lsr #2
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB0_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vctp.32 r3
+; CHECK-NEXT:    mov r4, r3
+; CHECK-NEXT:    and r3, r12, #15
+; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
+; CHECK-NEXT:    vdup.32 q3, r3
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q1, [r2], #16
+; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
+; CHECK-NEXT:    vcmp.i32 eq, q3, zr
+; CHECK-NEXT:    vpsel q1, q2, q1
+; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
+; CHECK-NEXT:    vmul.i32 q1, q1, q2
+; CHECK-NEXT:    add.w r12, r12, #4
+; CHECK-NEXT:    subs r3, r4, #4
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    le lr, .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %middle.block
+; CHECK-NEXT:    vctp.32 r4
+; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -69,29 +94,57 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
   ret i32 %res.0.lcssa
 }
 
-; CHECK-LABEL: vpsel_mul_reduce_add_2
-; CHECK:      dls lr, lr
-; CHECK:  [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
-; CHECK:      mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
-; CHECK:      vstr p0, [sp
-; CHECK:      vpstt
-; CHECK-NEXT: vldrwt.u32
-; CHECK-NEXT: vldrwt.u32
-; CHECK;      vsub
-; CHECK:      vpst
-; CHECK-NEXT: vldrwt.u32
-; CHECK:      vcmp.i32
-; CHECK:      vpsel
-; CHECK:      vldr p0, [sp
-; CHECK:      vpst	
-; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
-; CHECK:      sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
-; CHECK:      le lr, [[LOOP]]
-; CHECK:      vctp.32	[[ELEMS_OUT]]
-; CHECK-NEXT: vpsel
-; CHECK-NEXT: vaddv.u32
 define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
+; CHECK-LABEL: vpsel_mul_reduce_add_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    ldr r5, [sp, #20]
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    beq .LBB1_4
+; CHECK-NEXT:  @ %bb.1: @ %vector.ph
+; CHECK-NEXT:    adds r4, r5, #3
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    bic r4, r4, #3
+; CHECK-NEXT:    sub.w r12, r4, #4
+; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    add.w lr, r4, r12, lsr #2
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB1_2: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vctp.32 r5
+; CHECK-NEXT:    mov r4, r5
+; CHECK-NEXT:    and r5, r12, #15
+; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
+; CHECK-NEXT:    vldrwt.u32 q2, [r2], #16
+; CHECK-NEXT:    vdup.32 q3, r5
+; CHECK-NEXT:    vsub.i32 q1, q2, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
+; CHECK-NEXT:    vcmp.i32 eq, q3, zr
+; CHECK-NEXT:    vpsel q1, q1, q2
+; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
+; CHECK-NEXT:    vmul.i32 q1, q1, q2
+; CHECK-NEXT:    add.w r12, r12, #4
+; CHECK-NEXT:    subs r5, r4, #4
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    le lr, .LBB1_2
+; CHECK-NEXT:  @ %bb.3: @ %middle.block
+; CHECK-NEXT:    vctp.32 r4
+; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
                                          i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
@@ -146,23 +199,38 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
   ret i32 %res.0.lcssa
 }
 
-; CHECK-LABEL: and_mul_reduce_add
-; CHECK:      dls lr, lr
-; CHECK:  [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
-; CHECK:      vpstt	
-; CHECK-NEXT: vldrwt.u32
-; CHECK-NEXT: vldrwt.u32
-; CHECK:      mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
-; CHECK:      sub{{.*}} [[ELEMS]],{{.*}}#4
-; CHECK:      vpsttt
-; CHECK-NEXT: vcmpt.i32	eq, {{.*}}, zr
-; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
-; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
-; CHECK:      le lr, [[LOOP]]
-; CHECK:      vctp.32 [[ELEMS_OUT]]
-; CHECK:      vpsel
 define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
+; CHECK-LABEL: and_mul_reduce_add:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    ldr r5, [sp, #16]
+; CHECK-NEXT:    cbz r5, .LBB2_4
+; CHECK-NEXT:  @ %bb.1: @ %vector.ph
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    dlstp.32 lr, r5
+; CHECK-NEXT:  .LBB2_2: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
+; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
+; CHECK-NEXT:    mov r12, r5
+; CHECK-NEXT:    vsub.i32 q1, q2, q1
+; CHECK-NEXT:    subs r5, #4
+; CHECK-NEXT:    vcmp.i32 eq, q1, zr
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
+; CHECK-NEXT:    vldrwt.u32 q2, [r2], #16
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    letp lr, .LBB2_2
+; CHECK-NEXT:  @ %bb.3: @ %middle.block
+; CHECK-NEXT:    vctp.32 r12
+; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:  .LBB2_4:
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
                                          i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
@@ -214,31 +282,55 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
   ret i32 %res.0.lcssa
 }
 
-; TODO: Why does p0 get reloaded from the stack into p0, just to be vmrs'd?
-; CHECK-LABEL: or_mul_reduce_add
-; CHECK:      dls lr, lr
-; CHECK:  [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
-; CHECK:      vstr p0, [sp
-; CHECK:      mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
-; CHECK:      vpstt	
-; CHECK-NEXT: vldrwt.u32
-; CHECK-NEXT: vldrwt.u32
-; CHECK:      vcmp.i32	eq, {{.*}}, zr
-; CHECK:      vmrs [[VCMP:r[0-9]+]], p0
-; CHECK:      vldr p0, [sp
-; CHECK:      vmrs [[VCTP:r[0-9]+]], p0
-; CHECK:      orr{{.*}} [[VCMP]], [[VCTP]]
-; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]], [[ELEMS_OUT]], #4
-; CHECK-NEXT: vmsr p0
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
-; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
-; CHECK:      le lr, [[LOOP]]
-; CHECK:      vctp.32 [[ELEMS_OUT]]
-; CHECK:      vpsel
-define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
-                                        i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
+define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
+; CHECK-LABEL: or_mul_reduce_add:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    ldr r5, [sp, #20]
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    beq .LBB3_4
+; CHECK-NEXT:  @ %bb.1: @ %vector.ph
+; CHECK-NEXT:    adds r4, r5, #3
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    bic r4, r4, #3
+; CHECK-NEXT:    sub.w r12, r4, #4
+; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    add.w lr, r4, r12, lsr #2
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB3_2: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vctp.32 r5
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
+; CHECK-NEXT:    mov r12, r5
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
+; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
+; CHECK-NEXT:    vsub.i32 q1, q2, q1
+; CHECK-NEXT:    vcmp.i32 eq, q1, zr
+; CHECK-NEXT:    vmrs r4, p0
+; CHECK-NEXT:    vldr p0, [sp] @ 4-byte Reload
+; CHECK-NEXT:    vmrs r5, p0
+; CHECK-NEXT:    orrs r4, r5
+; CHECK-NEXT:    sub.w r5, r12, #4
+; CHECK-NEXT:    vmsr p0, r4
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
+; CHECK-NEXT:    vldrwt.u32 q2, [r2], #16
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    le lr, .LBB3_2
+; CHECK-NEXT:  @ %bb.3: @ %middle.block
+; CHECK-NEXT:    vctp.32 r12
+; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:  .LBB3_4:
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -289,8 +381,141 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
   ret i32 %res.0.lcssa
 }
 
+define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2) {
+; CHECK-LABEL: continue_on_zero:
+; CHECK:       @ %bb.0: @ %bb
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    popeq {r7, pc}
+; CHECK-NEXT:    mov r3, r0
+; CHECK-NEXT:    dlstp.32 lr, r2
+; CHECK-NEXT:  .LBB4_1: @ %bb9
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
+; CHECK-NEXT:    vcmp.i32 ne, q0, zr
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    mov r0, r3
+; CHECK-NEXT:    letp lr, .LBB4_1
+; CHECK-NEXT:  @ %bb.2: @ %bb27
+; CHECK-NEXT:    pop {r7, pc}
+bb:
+  %tmp = icmp eq i32 %arg2, 0
+  br i1 %tmp, label %bb27, label %bb3
+
+bb3:                                              ; preds = %bb
+  %tmp4 = add i32 %arg2, 3
+  %tmp5 = and i32 %tmp4, -4
+  %tmp6 = add i32 %arg2, -1
+  %tmp7 = insertelement <4 x i32> undef, i32 %tmp6, i32 0
+  %tmp8 = shufflevector <4 x i32> %tmp7, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %bb9
+
+bb9:                                              ; preds = %bb9, %bb3
+  %tmp10 = phi i32 [ 0, %bb3 ], [ %tmp25, %bb9 ]
+  %tmp11 = insertelement <4 x i32> undef, i32 %tmp10, i32 0
+  %tmp12 = shufflevector <4 x i32> %tmp11, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp13 = add <4 x i32> %tmp12, <i32 0, i32 1, i32 2, i32 3>
+  %tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10
+  %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8
+  %tmp16 = bitcast i32* %tmp14 to <4 x i32>*
+  %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef)
+  %tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer
+  %tmp19 = getelementptr inbounds i32, i32* %arg, i32 %tmp10
+  %tmp20 = and <4 x i1> %tmp18, %tmp15
+  %tmp21 = bitcast i32* %tmp19 to <4 x i32>*
+  %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp21, i32 4, <4 x i1> %tmp20, <4 x i32> undef)
+  %tmp23 = mul nsw <4 x i32> %tmp22, %tmp17
+  %tmp24 = bitcast i32* %tmp19 to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %tmp24, i32 4, <4 x i1> %tmp20)
+  %tmp25 = add i32 %tmp10, 4
+  %tmp26 = icmp eq i32 %tmp25, %tmp5
+  br i1 %tmp26, label %bb27, label %bb9
+
+bb27:                                             ; preds = %bb9, %bb
+  ret void
+}
+
+define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2, i32 %arg3) {
+; CHECK-LABEL: range_test:
+; CHECK:       @ %bb.0: @ %bb
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    popeq {r7, pc}
+; CHECK-NEXT:    add.w r12, r3, #3
+; CHECK-NEXT:    mov.w lr, #1
+; CHECK-NEXT:    bic r12, r12, #3
+; CHECK-NEXT:    vdup.32 q0, r2
+; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB5_1: @ %bb12
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vctp.32 r3
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q1, [r2], #16
+; CHECK-NEXT:    vpttt.s32 ge, q0, q1
+; CHECK-NEXT:    vcmpt.i32 ne, q1, zr
+; CHECK-NEXT:    vctpt.32 r3
+; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
+; CHECK-NEXT:    subs r3, #4
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q1, [r0]
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    le lr, .LBB5_1
+; CHECK-NEXT:  @ %bb.2: @ %bb32
+; CHECK-NEXT:    pop {r7, pc}
+bb:
+  %tmp = icmp eq i32 %arg3, 0
+  br i1 %tmp, label %bb32, label %bb4
+
+bb4:                                              ; preds = %bb
+  %tmp5 = add i32 %arg3, 3
+  %tmp6 = and i32 %tmp5, -4
+  %tmp7 = add i32 %arg3, -1
+  %tmp8 = insertelement <4 x i32> undef, i32 %tmp7, i32 0
+  %tmp9 = shufflevector <4 x i32> %tmp8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp10 = insertelement <4 x i32> undef, i32 %arg2, i32 0
+  %tmp11 = shufflevector <4 x i32> %tmp10, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %bb12
+
+bb12:                                             ; preds = %bb12, %bb4
+  %tmp13 = phi i32 [ 0, %bb4 ], [ %tmp30, %bb12 ]
+  %tmp14 = insertelement <4 x i32> undef, i32 %tmp13, i32 0
+  %tmp15 = shufflevector <4 x i32> %tmp14, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp16 = add <4 x i32> %tmp15, <i32 0, i32 1, i32 2, i32 3>
+  %tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13
+  %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9
+  %tmp19 = bitcast i32* %tmp17 to <4 x i32>*
+  %tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef)
+  %tmp21 = icmp ne <4 x i32> %tmp20, zeroinitializer
+  %tmp22 = icmp sle <4 x i32> %tmp20, %tmp11
+  %tmp23 = getelementptr inbounds i32, i32* %arg1, i32 %tmp13
+  %tmp24 = and <4 x i1> %tmp22, %tmp21
+  %tmp25 = and <4 x i1> %tmp24, %tmp18
+  %tmp26 = bitcast i32* %tmp23 to <4 x i32>*
+  %tmp27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp26, i32 4, <4 x i1> %tmp25, <4 x i32> undef)
+  %tmp28 = mul nsw <4 x i32> %tmp27, %tmp20
+  %tmp29 = bitcast i32* %tmp17 to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp28, <4 x i32>* %tmp29, i32 4, <4 x i1> %tmp25)
+  %tmp30 = add i32 %tmp13, 4
+  %tmp31 = icmp eq i32 %tmp30, %tmp6
+  br i1 %tmp31, label %bb32, label %bb12
+
+bb32:                                             ; preds = %bb12, %bb
+  ret void
+}
+
 ; Function Attrs: argmemonly nounwind readonly willreturn
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
 
 ; Function Attrs: nounwind readnone willreturn
 declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
new file mode 100644
index 000000000000..cafd1317c57c
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
@@ -0,0 +1,230 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
+--- |
+  @mask = external global i16
+  ; Function Attrs: nofree norecurse nounwind
+  define dso_local void @test(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2, i32* noalias nocapture readonly %arg3) local_unnamed_addr #0 {
+  bb:
+    %tmp = icmp eq i32 %arg2, 0
+    %tmp1 = add i32 %arg2, 3
+    %tmp2 = lshr i32 %tmp1, 2
+    %tmp3 = shl nuw i32 %tmp2, 2
+    %tmp4 = add i32 %tmp3, -4
+    %tmp5 = lshr i32 %tmp4, 2
+    %tmp6 = add nuw nsw i32 %tmp5, 1
+    %mask.gep9 = bitcast i16* @mask to i16*
+    %mask.load = load i16, i16* %mask.gep9
+    %conv.mask = zext i16 %mask.load to i32
+    %invariant.mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %conv.mask)
+    %mask.insert = insertelement <4 x i32> undef, i32 %conv.mask, i32 0
+    %invariant.limits = shufflevector <4 x i32> %mask.insert, <4 x i32> undef, <4 x i32> zeroinitializer
+    br i1 %tmp, label %bb27, label %bb3
+
+  bb3:                                              ; preds = %bb
+    call void @llvm.set.loop.iterations.i32(i32 %tmp6)
+    %scevgep1 = getelementptr i32, i32* %arg3, i32 -4
+    br label %bb9
+
+  bb9:                                              ; preds = %bb9, %bb3
+    %lsr.iv4 = phi i32* [ %scevgep6, %bb9 ], [ %scevgep1, %bb3 ]
+    %lsr.iv2 = phi i32* [ %scevgep3, %bb9 ], [ %arg1, %bb3 ]
+    %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ]
+    %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ]
+    %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ]
+    %lsr.iv47 = bitcast i32* %lsr.iv4 to <4 x i32>*
+    %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
+    %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
+    %vctp = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp8)
+    %and = and <4 x i1> %vctp, %invariant.mask
+    %tmp11 = sub i32 %tmp8, 4
+    %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef)
+    %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and, <4 x i32> undef)
+    %tmp23 = mul nsw <4 x i32> %tmp22, %tmp17
+    %scevgep8 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv47, i32 1
+    %load.limits = load <4 x i32>, <4 x i32>* %scevgep8
+    %bad.icmp = icmp ule <4 x i32> %load.limits, %invariant.limits
+    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %bad.icmp)
+    %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp7, i32 1)
+    %tmp13 = icmp ne i32 %tmp12, 0
+    %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+    %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4
+    %scevgep6 = getelementptr i32, i32* %lsr.iv4, i32 4
+    br i1 %tmp13, label %bb9, label %bb27
+
+  bb27:                                             ; preds = %bb9, %bb
+    ret void
+  }
+  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
+  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
+  declare void @llvm.set.loop.iterations.i32(i32) #3
+  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
+  declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #4
+  declare void @llvm.stackprotector(i8*, i8**) #5
+
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+  - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       20
+  offsetAdjustment: -12
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -20, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r6', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 4, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0.bb:
+  ; CHECK:   successors: %bb.3(0x30000000), %bb.1(0x50000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $r4, $r6, $lr
+  ; CHECK:   frame-setup tPUSH 14, $noreg, killed $r4, killed $r6, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r6, -12
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -16
+  ; CHECK:   $r7 = frame-setup tADDrSPi $sp, 2, 14, $noreg
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa $r7, 8
+  ; CHECK:   $sp = frame-setup tSUBspi $sp, 1, 14, $noreg
+  ; CHECK:   tCBZ $r2, %bb.3
+  ; CHECK: bb.1.bb3:
+  ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3
+  ; CHECK:   $r12 = t2MOVi16 target-flags(arm-lo16) @mask, 14, $noreg
+  ; CHECK:   renamable $lr = t2ADDri renamable $r2, 3, 14, $noreg, $noreg
+  ; CHECK:   $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @mask, 14, $noreg
+  ; CHECK:   renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
+  ; CHECK:   renamable $r12 = t2LDRHi12 killed renamable $r12, 0, 14, $noreg :: (dereferenceable load 2 from %ir.mask.gep9)
+  ; CHECK:   renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
+  ; CHECK:   renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
+  ; CHECK:   $vpr = VMSR_P0 $r12, 14, $noreg
+  ; CHECK:   renamable $q0 = MVE_VDUP32 killed renamable $r12, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 16, 14, $noreg, $noreg
+  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
+  ; CHECK:   VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
+  ; CHECK:   $r3 = tMOVr $r0, 14, $noreg
+  ; CHECK:   $lr = t2DLS renamable $lr
+  ; CHECK: bb.2.bb9:
+  ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r12
+  ; CHECK:   renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0)
+  ; CHECK:   MVE_VPST 2, implicit $vpr
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
+  ; CHECK:   renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
+  ; CHECK:   renamable $r3, renamable $q2 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
+  ; CHECK:   renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
+  ; CHECK:   renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
+  ; CHECK:   renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep8, align 8)
+  ; CHECK:   renamable $vpr = MVE_VCMPu32 renamable $q0, killed renamable $q2, 2, 0, $noreg
+  ; CHECK:   MVE_VPST 8, implicit $vpr
+  ; CHECK:   MVE_VSTRWU32 killed renamable $q1, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
+  ; CHECK:   $r0 = tMOVr $r3, 14, $noreg
+  ; CHECK:   $lr = t2LEUpdate renamable $lr, %bb.2
+  ; CHECK: bb.3.bb27:
+  ; CHECK:   $sp = tADDspi $sp, 1, 14, $noreg
+  ; CHECK:   tPOP_RET 14, $noreg, def $r4, def $r6, def $r7, def $pc
+  bb.0.bb:
+    successors: %bb.3(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2, $r3, $r4, $r6, $lr
+
+    frame-setup tPUSH 14, $noreg, killed $r4, killed $r6, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 16
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    frame-setup CFI_INSTRUCTION offset $r6, -12
+    frame-setup CFI_INSTRUCTION offset $r4, -16
+    $r7 = frame-setup tADDrSPi $sp, 2, 14, $noreg
+    frame-setup CFI_INSTRUCTION def_cfa $r7, 8
+    $sp = frame-setup tSUBspi $sp, 1, 14, $noreg
+    tCBZ $r2, %bb.3
+
+  bb.1.bb3:
+    successors: %bb.2(0x80000000)
+    liveins: $r0, $r1, $r2, $r3
+
+    $r12 = t2MOVi16 target-flags(arm-lo16) @mask, 14, $noreg
+    renamable $lr = t2ADDri renamable $r2, 3, 14, $noreg, $noreg
+    $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @mask, 14, $noreg
+    renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
+    renamable $r12 = t2LDRHi12 killed renamable $r12, 0, 14, $noreg :: (dereferenceable load 2 from %ir.mask.gep9)
+    renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
+    renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
+    $vpr = VMSR_P0 $r12, 14, $noreg
+    renamable $q0 = MVE_VDUP32 killed renamable $r12, 0, $noreg, undef renamable $q0
+    renamable $r12 = t2SUBri killed renamable $r3, 16, 14, $noreg, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
+    VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
+    $r3 = tMOVr $r0, 14, $noreg
+    t2DoLoopStart renamable $lr
+
+  bb.2.bb9:
+    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r12
+
+    renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0)
+    MVE_VPST 2, implicit $vpr
+    renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
+    renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
+    renamable $r3, renamable $q2 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
+    renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
+    renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
+    renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep8, align 8)
+    renamable $vpr = MVE_VCMPu32 renamable $q0, killed renamable $q2, 2, 0, $noreg
+    MVE_VPST 8, implicit $vpr
+    MVE_VSTRWU32 killed renamable $q1, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    $r0 = tMOVr $r3, 14, $noreg
+    t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr
+    tB %bb.3, 14, $noreg
+
+  bb.3.bb27:
+    $sp = tADDspi $sp, 1, 14, $noreg
+    tPOP_RET 14, $noreg, def $r4, def $r6, def $r7, def $pc
+
+...

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
new file mode 100644
index 000000000000..83dc8731f654
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
@@ -0,0 +1,230 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops -verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+  ; Function Attrs: nofree norecurse nounwind
+  define dso_local void @test(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2, i16 zeroext %mask) local_unnamed_addr #0 {
+  bb:
+    %tmp = icmp eq i32 %arg2, 0
+    %tmp1 = add i32 %arg2, 3
+    %tmp2 = lshr i32 %tmp1, 2
+    %tmp3 = shl nuw i32 %tmp2, 2
+    %tmp4 = add i32 %tmp3, -4
+    %tmp5 = lshr i32 %tmp4, 2
+    %tmp6 = add nuw nsw i32 %tmp5, 1
+    %conv.mask = zext i16 %mask to i32
+    %invariant.mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %conv.mask)
+    br i1 %tmp, label %bb27, label %bb3
+
+  bb3:                                              ; preds = %bb
+    call void @llvm.set.loop.iterations.i32(i32 %tmp6)
+    br label %bb9
+
+  bb9:                                              ; preds = %bb9, %bb3
+    %lsr.iv2 = phi i32* [ %scevgep3, %bb9 ], [ %arg1, %bb3 ]
+    %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ]
+    %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ]
+    %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ]
+    %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
+    %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
+    %vctp = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp8)
+    %and = and <4 x i1> %vctp, %invariant.mask
+    %tmp11 = sub i32 %tmp8, 4
+    %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef), !tbaa !3
+    %tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer
+    %tmp20 = and <4 x i1> %tmp18, %vctp
+    %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20, <4 x i32> undef), !tbaa !3
+    %tmp23 = mul nsw <4 x i32> %tmp22, %tmp17
+    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20), !tbaa !3
+    %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp7, i32 1)
+    %tmp13 = icmp ne i32 %tmp12, 0
+    %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+    %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4
+    br i1 %tmp13, label %bb9, label %bb27, !llvm.loop !7
+
+  bb27:                                             ; preds = %bb9, %bb
+    ret void
+  }
+  ; Function Attrs: argmemonly nounwind readonly willreturn
+  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
+  ; Function Attrs: argmemonly nounwind willreturn
+  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
+  ; Function Attrs: noduplicate nounwind
+  declare void @llvm.set.loop.iterations.i32(i32) #3
+  ; Function Attrs: noduplicate nounwind
+  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
+  ; Function Attrs: nounwind readnone
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
+  ; Function Attrs: nounwind readnone
+  declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #4
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #5
+
+  attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mve" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { argmemonly nounwind readonly willreturn "target-features"="+mve" }
+  attributes #2 = { argmemonly nounwind willreturn "target-features"="+mve" }
+  attributes #3 = { noduplicate nounwind "target-features"="+mve" }
+  attributes #4 = { nounwind readnone "target-features"="+mve" }
+  attributes #5 = { nounwind }
+
+  !llvm.module.flags = !{!0, !1}
+  !llvm.ident = !{!2}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{i32 1, !"min_enum_size", i32 4}
+  !2 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 8f92f97150cbdd3b9f569570b8377db78ed61a9e)"}
+  !3 = !{!4, !4, i64 0}
+  !4 = !{!"int", !5, i64 0}
+  !5 = !{!"omnipotent char", !6, i64 0}
+  !6 = !{!"Simple C/C++ TBAA"}
+  !7 = distinct !{!7, !8}
+  !8 = !{!"llvm.loop.isvectorized", i32 1}
+
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+  - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       12
+  offsetAdjustment: -4
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0.bb:
+  ; CHECK:   successors: %bb.3(0x30000000), %bb.1(0x50000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $lr
+  ; CHECK:   frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   $r7 = frame-setup tMOVr $sp, 14, $noreg
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_register $r7
+  ; CHECK:   $sp = frame-setup tSUBspi $sp, 1, 14, $noreg
+  ; CHECK:   tCBZ $r2, %bb.3
+  ; CHECK: bb.1.bb3:
+  ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3
+  ; CHECK:   renamable $r12 = t2ADDri renamable $r2, 3, 14, $noreg, $noreg
+  ; CHECK:   renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+  ; CHECK:   renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+  ; CHECK:   $vpr = VMSR_P0 killed $r3, 14, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+  ; CHECK:   VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
+  ; CHECK:   $r3 = tMOVr $r0, 14, $noreg
+  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+  ; CHECK:   $lr = t2DLS renamable $lr
+  ; CHECK: bb.2.bb9:
+  ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3
+  ; CHECK:   renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0)
+  ; CHECK:   MVE_VPST 4, implicit $vpr
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
+  ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3)
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
+  ; CHECK:   renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
+  ; CHECK:   MVE_VPST 4, implicit $vpr
+  ; CHECK:   renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr
+  ; CHECK:   renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3)
+  ; CHECK:   renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+  ; CHECK:   MVE_VPST 8, implicit $vpr
+  ; CHECK:   MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3)
+  ; CHECK:   $r0 = tMOVr $r3, 14, $noreg
+  ; CHECK:   $lr = t2LEUpdate renamable $lr, %bb.2
+  ; CHECK: bb.3.bb27:
+  ; CHECK:   $sp = tADDspi $sp, 1, 14, $noreg
+  ; CHECK:   tPOP_RET 14, $noreg, def $r7, def $pc
+  bb.0.bb:
+    successors: %bb.3(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2, $r3, $lr
+
+    frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    $r7 = frame-setup tMOVr $sp, 14, $noreg
+    frame-setup CFI_INSTRUCTION def_cfa_register $r7
+    $sp = frame-setup tSUBspi $sp, 1, 14, $noreg
+    tCBZ $r2, %bb.3
+
+  bb.1.bb3:
+    successors: %bb.2(0x80000000)
+    liveins: $r0, $r1, $r2, $r3
+
+    renamable $r12 = t2ADDri renamable $r2, 3, 14, $noreg, $noreg
+    renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+    $vpr = VMSR_P0 killed $r3, 14, $noreg
+    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+    VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
+    $r3 = tMOVr $r0, 14, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+    t2DoLoopStart renamable $lr
+
+  bb.2.bb9:
+    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+    liveins: $lr, $r0, $r1, $r2, $r3
+
+    renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0)
+    MVE_VPST 4, implicit $vpr
+    renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
+    renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3)
+    renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
+    renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
+    MVE_VPST 4, implicit $vpr
+    renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr
+    renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3)
+    renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+    MVE_VPST 8, implicit $vpr
+    MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3)
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    $r0 = tMOVr $r3, 14, $noreg
+    t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr
+    tB %bb.3, 14, $noreg
+
+  bb.3.bb27:
+    $sp = tADDspi $sp, 1, 14, $noreg
+    tPOP_RET 14, $noreg, def $r7, def $pc
+
+...

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir
new file mode 100644
index 000000000000..fd6345693e83
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir
@@ -0,0 +1,214 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s
+--- |
+  ; Function Attrs: nofree norecurse nounwind
+  define dso_local void @test(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2, i16 zeroext %mask) local_unnamed_addr #0 {
+  bb:
+    %tmp = icmp eq i32 %arg2, 0
+    %tmp1 = add i32 %arg2, 3
+    %tmp2 = lshr i32 %tmp1, 2
+    %tmp3 = shl nuw i32 %tmp2, 2
+    %tmp4 = add i32 %tmp3, -4
+    %tmp5 = lshr i32 %tmp4, 2
+    %tmp6 = add nuw nsw i32 %tmp5, 1
+    %conv.mask = zext i16 %mask to i32
+    %invariant.mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %conv.mask)
+    br i1 %tmp, label %bb27, label %bb3
+
+  bb3:                                              ; preds = %bb
+    call void @llvm.set.loop.iterations.i32(i32 %tmp6)
+    br label %bb9
+
+  bb9:                                              ; preds = %bb9, %bb3
+    %lsr.iv2 = phi i32* [ %scevgep3, %bb9 ], [ %arg1, %bb3 ]
+    %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ]
+    %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ]
+    %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ]
+    %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
+    %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
+    %vctp = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp8)
+    %and = and <4 x i1> %vctp, %invariant.mask
+    %tmp11 = sub i32 %tmp8, 4
+    %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef), !tbaa !3
+    %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and, <4 x i32> undef), !tbaa !3
+    %tmp23 = mul nsw <4 x i32> %tmp22, %tmp17
+    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and), !tbaa !3
+    %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp7, i32 1)
+    %tmp13 = icmp ne i32 %tmp12, 0
+    %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+    %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4
+    br i1 %tmp13, label %bb9, label %bb27, !llvm.loop !7
+
+  bb27:                                             ; preds = %bb9, %bb
+    ret void
+  }
+  ; Function Attrs: argmemonly nounwind readonly willreturn
+  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
+  ; Function Attrs: argmemonly nounwind willreturn
+  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
+  ; Function Attrs: noduplicate nounwind
+  declare void @llvm.set.loop.iterations.i32(i32) #3
+  ; Function Attrs: noduplicate nounwind
+  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
+  ; Function Attrs: nounwind readnone
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
+  ; Function Attrs: nounwind readnone
+  declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #4
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #5
+
+  attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mve" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { argmemonly nounwind readonly willreturn "target-features"="+mve" }
+  attributes #2 = { argmemonly nounwind willreturn "target-features"="+mve" }
+  attributes #3 = { noduplicate nounwind "target-features"="+mve" }
+  attributes #4 = { nounwind readnone "target-features"="+mve" }
+  attributes #5 = { nounwind }
+
+  !llvm.module.flags = !{!0, !1}
+  !llvm.ident = !{!2}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{i32 1, !"min_enum_size", i32 4}
+  !2 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 8f92f97150cbdd3b9f569570b8377db78ed61a9e)"}
+  !3 = !{!4, !4, i64 0}
+  !4 = !{!"int", !5, i64 0}
+  !5 = !{!"omnipotent char", !6, i64 0}
+  !6 = !{!"Simple C/C++ TBAA"}
+  !7 = distinct !{!7, !8}
+  !8 = !{!"llvm.loop.isvectorized", i32 1}
+
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+  - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       12
+  offsetAdjustment: -4
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0.bb:
+  ; CHECK:   successors: %bb.3(0x30000000), %bb.1(0x50000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3, $lr
+  ; CHECK:   frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   $r7 = frame-setup tMOVr $sp, 14, $noreg
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_register $r7
+  ; CHECK:   $sp = frame-setup tSUBspi $sp, 1, 14, $noreg
+  ; CHECK:   tCBZ $r2, %bb.3
+  ; CHECK: bb.1.bb3:
+  ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK:   liveins: $r0, $r1, $r2, $r3
+  ; CHECK:   $vpr = VMSR_P0 killed $r3, 14, $noreg
+  ; CHECK:   VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
+  ; CHECK:   $r3 = tMOVr $r0, 14, $noreg
+  ; CHECK:   $lr = MVE_DLSTP_32 renamable $r2
+  ; CHECK: bb.2.bb9:
+  ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3
+  ; CHECK:   renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0)
+  ; CHECK:   MVE_VPST 4, implicit $vpr
+  ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3)
+  ; CHECK:   renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3)
+  ; CHECK:   renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+  ; CHECK:   MVE_VPST 8, implicit $vpr
+  ; CHECK:   MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3)
+  ; CHECK:   $r0 = tMOVr $r3, 14, $noreg
+  ; CHECK:   $lr = MVE_LETP renamable $lr, %bb.2
+  ; CHECK: bb.3.bb27:
+  ; CHECK:   $sp = tADDspi $sp, 1, 14, $noreg
+  ; CHECK:   tPOP_RET 14, $noreg, def $r7, def $pc
+  bb.0.bb:
+    successors: %bb.3(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2, $r3, $lr
+
+    frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    $r7 = frame-setup tMOVr $sp, 14, $noreg
+    frame-setup CFI_INSTRUCTION def_cfa_register $r7
+    $sp = frame-setup tSUBspi $sp, 1, 14, $noreg
+    tCBZ $r2, %bb.3
+
+  bb.1.bb3:
+    successors: %bb.2(0x80000000)
+    liveins: $r0, $r1, $r2, $r3
+
+    renamable $r12 = t2ADDri renamable $r2, 3, 14, $noreg, $noreg
+    renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+    $vpr = VMSR_P0 killed $r3, 14, $noreg
+    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+    VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0)
+    $r3 = tMOVr $r0, 14, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+    t2DoLoopStart renamable $lr
+
+  bb.2.bb9:
+    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+    liveins: $lr, $r0, $r1, $r2, $r3
+
+    renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0)
+    MVE_VPST 2, implicit $vpr
+    renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
+    renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3)
+    renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3)
+    renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
+    renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+    MVE_VPST 8, implicit $vpr
+    MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3)
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    $r0 = tMOVr $r3, 14, $noreg
+    t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr
+    tB %bb.3, 14, $noreg
+
+  bb.3.bb27:
+    $sp = tADDspi $sp, 1, 14, $noreg
+    tPOP_RET 14, $noreg, def $r7, def $pc
+
+...


        


More information about the llvm-commits mailing list