[llvm] b4fa884 - [ARM] Improve VPT predicate tracking

Sam Parker via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 22 02:40:44 PDT 2020


Author: Sam Parker
Date: 2020-09-22T10:40:27+01:00
New Revision: b4fa884a73c5c883723738f67ad1810a6d48cc2d

URL: https://github.com/llvm/llvm-project/commit/b4fa884a73c5c883723738f67ad1810a6d48cc2d
DIFF: https://github.com/llvm/llvm-project/commit/b4fa884a73c5c883723738f67ad1810a6d48cc2d.diff

LOG: [ARM] Improve VPT predicate tracking

The VPTBlock class has been modified to track the 'global' state of
the VPR as well as the state for each block. Each object now just
holds a list of the instructions that make up the block, while static
structures hold the predicate information. This enables global access
for querying how both a VPT block and individual instructions are
predicated. These changes again allow us to handle more complicated
cases where multiple instructions build a predicate and/or where the
same predicate is used in multiple blocks.
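
As a rough illustration of the scheme (a simplified, self-contained
sketch, not the pass's actual code: plain std containers and integer
instruction ids stand in for MachineInstr* and LLVM's
SmallVector/SetVector):

  #include <cassert>
  #include <map>
  #include <set>
  #include <vector>

  struct VPTBlockSketch {
    std::vector<int> Insts; // VPT/VPST first, then up to 4 predicated insts
  };

  // 'Global' VPR state, shared across all blocks in the loop.
  static std::vector<VPTBlockSketch> Blocks;
  static std::set<int> CurrentPredicates;              // insts AND'ed into VPR
  static std::map<int, std::set<int>> PredicatedInsts; // inst -> its predicates

  static void resetPredicate(int MI) {
    CurrentPredicates.clear();
    CurrentPredicates.insert(MI);
  }

  static void createVPTBlock(int MI) {
    assert(!CurrentPredicates.empty() && "can't begin VPT without predicate");
    Blocks.push_back({{MI}});
    // MI executes under the conjunction of the current predicate producers.
    PredicatedInsts.emplace(MI, CurrentPredicates);
  }

  static void addInst(int MI) {
    Blocks.back().Insts.push_back(MI);
    assert(Blocks.back().Insts.size() <= 5 && "VPT/VPST + 4 predicated insts");
    PredicatedInsts.emplace(MI, CurrentPredicates);
  }

  int main() {
    // Two blocks guarded by the same predicate (inst 1): with the state
    // held globally, any block or instruction can be queried for how it
    // is predicated.
    resetPredicate(/*vctp*/ 1);
    createVPTBlock(/*vpst*/ 2);
    addInst(3);
    createVPTBlock(/*vpst*/ 4);
    addInst(5);
    assert(PredicatedInsts[3] == PredicatedInsts[5]);
  }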

This doesn't, however, get us back to the state before the tracking
was 'fixed', as some extra logic will be required to properly handle
VPT instructions. Currently, a VPT can be effectively predicated
because of its inputs, but the existing logic will not detect that and
so will refuse to perform the transformation. This can be seen in the
remat-vctp.ll test, where we still don't perform the transform.
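
A hedged sketch of that missing check (the types and names below are
illustrative stand-ins, not existing LLVM APIs): a VPT whose vector
inputs are all produced under the vctp's mask never depends on a lane
the vctp disables, so it is effectively predicated even without a
vpred operand.

  #include <cassert>
  #include <map>
  #include <vector>

  struct Inst {
    int Id;
    std::vector<const Inst *> VectorOps; // producers of the vector inputs
  };

  // inst id -> already known to be predicated on the vctp?
  static std::map<int, bool> PredicatedOnVCTP;

  static bool isPredicatedOnVCTP(const Inst &MI) {
    auto It = PredicatedOnVCTP.find(MI.Id);
    return It != PredicatedOnVCTP.end() && It->second;
  }

  // The missing inference: if every input was computed under the vctp's
  // mask, the VPT's result in the masked-off lanes cannot matter.
  static bool isEffectivelyPredicatedVPT(const Inst &VPT) {
    if (VPT.VectorOps.empty())
      return false;
    for (const Inst *Op : VPT.VectorOps)
      if (!isPredicatedOnVCTP(*Op))
        return false;
    return true;
  }

  int main() {
    Inst Load{1, {}}; // e.g. a VLDR that is already predicated on the vctp
    PredicatedOnVCTP[Load.Id] = true;
    Inst VPT{2, {&Load}};
    assert(isEffectivelyPredicatedVPT(VPT));
  }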

Differential Revision: https://reviews.llvm.org/D87681

Added: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll

Modified: 
    llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index bad5e6c70655..4452a6c1f1d1 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -166,62 +166,136 @@ namespace {
     }
   };
 
-  // Represent a VPT block, a list of instructions that begins with a VPT/VPST
-  // and has a maximum of four proceeding instructions. All instructions within
-  // the block are predicated upon the vpr and we allow instructions to define
-  // the vpr within in the block too.
-  class VPTBlock {
-    // The predicate then instruction, which is either a VPT, or a VPST
-    // instruction.
-    std::unique_ptr<PredicatedMI> PredicateThen;
-    PredicatedMI *Divergent = nullptr;
-    SmallVector<PredicatedMI, 4> Insts;
+  // Represent the current state of the VPR and hold all instances which
+  // represent a VPT block, which is a list of instructions that begins with a
+  // VPT/VPST and has a maximum of four succeeding instructions. All
+  // instructions within the block are predicated upon the vpr and we allow
+  // instructions to define the vpr within the block too.
+  class VPTState {
+    friend struct LowOverheadLoop;
+
+    SmallVector<MachineInstr *, 4> Insts;
+
+    static SmallVector<VPTState, 4> Blocks;
+    static SetVector<MachineInstr *> CurrentPredicates;
+    static std::map<MachineInstr *,
+      std::unique_ptr<PredicatedMI>> PredicatedInsts;
+
+    static void CreateVPTBlock(MachineInstr *MI) {
+      assert(CurrentPredicates.size() && "Can't begin VPT without predicate");
+      Blocks.emplace_back(MI);
+      // The execution of MI is predicated upon the current set of instructions
+      // that are AND'ed together to form the VPR predicate value. In the case
+      // that MI is a VPT, CurrentPredicates will also just be MI.
+      PredicatedInsts.emplace(
+        MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
+    }
 
-  public:
-    VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
-      PredicateThen = std::make_unique<PredicatedMI>(MI, Preds);
+    static void reset() {
+      Blocks.clear();
+      PredicatedInsts.clear();
+      CurrentPredicates.clear();
     }
 
-    void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
-      LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
-      if (!Divergent && !set_difference(Preds, PredicateThen->Predicates).empty()) {
-        Divergent = &Insts.back();
-        LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
-      }
-      Insts.emplace_back(MI, Preds);
-      assert(Insts.size() <= 4 && "Too many instructions in VPT block!");
+    static void addInst(MachineInstr *MI) {
+      Blocks.back().insert(MI);
+      PredicatedInsts.emplace(
+        MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
     }
 
+    static void addPredicate(MachineInstr *MI) {
+      LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
+      CurrentPredicates.insert(MI);
+    }
+
+    static void resetPredicate(MachineInstr *MI) {
+      LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
+      CurrentPredicates.clear();
+      CurrentPredicates.insert(MI);
+    }
+
+  public:
     // Have we found an instruction within the block which defines the vpr? If
     // so, not all the instructions in the block will have the same predicate.
-    bool HasNonUniformPredicate() const {
-      return Divergent != nullptr;
+    static bool hasUniformPredicate(VPTState &Block) {
+      return getDivergent(Block) == nullptr;
+    }
+
+    // If it exists, return the first internal instruction which modifies the
+    // VPR.
+    static MachineInstr *getDivergent(VPTState &Block) {
+      SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+      for (unsigned i = 1; i < Insts.size(); ++i) {
+        MachineInstr *Next = Insts[i];
+        if (isVectorPredicate(Next))
+          return Next; // Found an instruction altering the vpr.
+      }
+      return nullptr;
     }
 
-    // Is the given instruction part of the predicate set controlling the entry
-    // to the block.
-    bool IsPredicatedOn(MachineInstr *MI) const {
-      return PredicateThen->Predicates.count(MI);
+    // Return whether the given instruction is predicated upon a VCTP.
+    static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
+      SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates;
+      if (Exclusive && Predicates.size() != 1)
+        return false;
+      for (auto *PredMI : Predicates)
+        if (isVCTP(PredMI))
+          return true;
+      return false;
+    }
+
+    // Is the VPST, controlling the block entry, predicated upon a VCTP.
+    static bool isEntryPredicatedOnVCTP(VPTState &Block,
+                                        bool Exclusive = false) {
+      SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+      return isPredicatedOnVCTP(Insts.front(), Exclusive);
+    }
+
+    static bool isValid() {
+      // All predication within the loop should be based on vctp. If the block
+      // isn't predicated on entry, check whether the vctp is within the block
+      // and that all other instructions are then predicated on it.
+      for (auto &Block : Blocks) {
+        if (isEntryPredicatedOnVCTP(Block))
+          continue;
+
+        SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+        for (auto *MI : Insts) {
+          // Check that any internal VCTPs are 'Then' predicated.
+          if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
+            return false;
+          // Skip other instructions that build up the predicate.
+          if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
+            continue;
+          // Check that any other instructions are predicated upon a vctp.
+          // TODO: We could infer when VPTs are implicitly predicated on the
+          // vctp (when the operands are predicated).
+          if (!isPredicatedOnVCTP(MI)) {
+            LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
+            return false;
+          }
+        }
+      }
+      return true;
     }
 
-    // Returns true if this is a VPT instruction.
-    bool isVPT() const { return !isVPST(); }
+    VPTState(MachineInstr *MI) { Insts.push_back(MI); }
 
-    // Returns true if this is a VPST instruction.
-    bool isVPST() const {
-      return PredicateThen->MI->getOpcode() == ARM::MVE_VPST;
+    void insert(MachineInstr *MI) {
+      Insts.push_back(MI);
+      // VPT/VPST + 4 predicated instructions.
+      assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
     }
 
-    // Is the given instruction the only predicate which controls the entry to
-    // the block.
-    bool IsOnlyPredicatedOn(MachineInstr *MI) const {
-      return IsPredicatedOn(MI) && PredicateThen->Predicates.size() == 1;
+    bool containsVCTP() const {
+      for (auto *MI : Insts)
+        if (isVCTP(MI))
+          return true;
+      return false;
     }
 
     unsigned size() const { return Insts.size(); }
-    SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
-    MachineInstr *getPredicateThen() const { return PredicateThen->MI; }
-    PredicatedMI *getDivergent() const { return Divergent; }
+    SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
   };
 
   struct LowOverheadLoop {
@@ -237,12 +311,8 @@ namespace {
     MachineInstr *Start = nullptr;
     MachineInstr *Dec = nullptr;
     MachineInstr *End = nullptr;
-    MachineInstr *VCTP = nullptr;
     MachineOperand TPNumElements;
-    SmallPtrSet<MachineInstr*, 4> SecondaryVCTPs;
-    VPTBlock *CurrentBlock = nullptr;
-    SetVector<MachineInstr*> CurrentPredicate;
-    SmallVector<VPTBlock, 4> VPTBlocks;
+    SmallVector<MachineInstr*, 4> VCTPs;
     SmallPtrSet<MachineInstr*, 4> ToRemove;
     SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute;
     bool Revert = false;
@@ -258,6 +328,7 @@ namespace {
         Preheader = MBB;
       else if (auto *MBB = MLI.findLoopPreheader(&ML, true))
         Preheader = MBB;
+      VPTState::reset();
     }
 
     // If this is an MVE instruction, check that we know how to use tail
@@ -272,10 +343,14 @@ namespace {
     bool IsTailPredicationLegal() const {
       // For now, let's keep things really simple and only support a single
       // block for tail predication.
-      return !Revert && FoundAllComponents() && VCTP &&
+      return !Revert && FoundAllComponents() && !VCTPs.empty() &&
              !CannotTailPredicate && ML.getNumBlocks() == 1;
     }
 
+    // Given that MI is a VCTP, check that it is equivalent to any other VCTPs
+    // found.
+    bool AddVCTP(MachineInstr *MI);
+
     // Check that the predication in the loop will be equivalent once we
     // perform the conversion. Also ensure that we can provide the number
     // of elements to the loop start instruction.
@@ -298,7 +373,9 @@ namespace {
       return Start && Dec && End;
     }
 
-    SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; }
+    SmallVectorImpl<VPTState> &getVPTBlocks() {
+      return VPTState::Blocks;
+    }
 
     // Return the operand for the loop start instruction. This will be the loop
     // iteration count, or the number of elements if we're tail predicating.
@@ -311,14 +388,18 @@ namespace {
       if (!IsTailPredicationLegal())
         return IsDo ? ARM::t2DLS : ARM::t2WLS;
 
-      return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo);
+      return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
     }
 
     void dump() const {
       if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
       if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
       if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
-      if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP;
+      if (!VCTPs.empty()) {
+        dbgs() << "ARM Loops: Found VCTP(s):\n";
+        for (auto *MI : VCTPs)
+          dbgs() << " - " << *MI;
+      }
       if (!FoundAllComponents())
         dbgs() << "ARM Loops: Not a low-overhead loop.\n";
       else if (!(Start && Dec && End))
@@ -382,6 +463,11 @@ namespace {
 
 char ARMLowOverheadLoops::ID = 0;
 
+SmallVector<VPTState, 4> VPTState::Blocks;
+SetVector<MachineInstr *> VPTState::CurrentPredicates;
+std::map<MachineInstr *,
+         std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;
+
 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
                 false, false)
 
@@ -419,38 +505,10 @@ MachineInstr *LowOverheadLoop::isSafeToDefineLR() {
 }
 
 bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
-  assert(VCTP && "VCTP instruction expected but is not set");
-  // All predication within the loop should be based on vctp. If the block
-  // isn't predicated on entry, check whether the vctp is within the block
-  // and that all other instructions are then predicated on it.
-  for (auto &Block : VPTBlocks) {
-    if (Block.IsPredicatedOn(VCTP))
-      continue;
-    if (Block.HasNonUniformPredicate() && !isVCTP(Block.getDivergent()->MI)) {
-      LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: "
-                        << *Block.getDivergent()->MI);
-      return false;
-    }
-    SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
-    for (auto &PredMI : Insts) {
-      // Check the instructions in the block and only allow:
-      //   - VCTPs
-      //   - Instructions predicated on the main VCTP
-      //   - Any VCMP
-      //      - VCMPs just "and" their result with VPR.P0. Whether they are
-      //      located before/after the VCTP is irrelevant - the end result will
-      //      be the same in both cases, so there's no point in requiring them
-      //      to be located after the VCTP!
-      if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI) ||
-          VCMPOpcodeToVPT(PredMI.MI->getOpcode()) != 0)
-        continue;
-      LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
-                 << " - which is predicated on:\n";
-                 for (auto *MI : PredMI.Predicates)
-                   dbgs() << "   - " << *MI);
-      return false;
-    }
-  }
+  assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
+
+  if (!VPTState::isValid())
+    return false;
 
   if (!ValidateLiveOuts()) {
     LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
@@ -461,6 +519,7 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
   // of the iteration count, to the loop start instruction. The number of
   // elements is provided to the vctp instruction, so we need to check that
   // we can use this register at InsertPt.
+  MachineInstr *VCTP = VCTPs.back();
   TPNumElements = VCTP->getOperand(1);
   Register NumElements = TPNumElements.getReg();
 
@@ -557,10 +616,10 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
   if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(),
                                              VCTP->getOperand(1).getReg())) {
     SmallPtrSet<MachineInstr*, 2> ElementChain;
-    SmallPtrSet<MachineInstr*, 2> Ignore = { VCTP };
+    SmallPtrSet<MachineInstr*, 2> Ignore;
     unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
 
-    Ignore.insert(SecondaryVCTPs.begin(), SecondaryVCTPs.end());
+    Ignore.insert(VCTPs.begin(), VCTPs.end());
 
     if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) {
       bool FoundSub = false;
@@ -853,7 +912,7 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) {
     LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
 
   if (!IsTailPredicationLegal()) {
-    LLVM_DEBUG(if (!VCTP)
+    LLVM_DEBUG(if (VCTPs.empty())
                  dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
                dbgs() << "ARM Loops: Tail-predication is not valid.\n");
     return;
@@ -866,6 +925,26 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) {
              dbgs() << "ARM Loops: Couldn't validate tail predicate.\n");
 }
 
+bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
+  LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
+  if (VCTPs.empty()) {
+    VCTPs.push_back(MI);
+    return true;
+  }
+
+  // If we find another VCTP, check whether it uses the same value as the main VCTP.
+  // If it does, store it in the VCTPs set, else refuse it.
+  MachineInstr *Prev = VCTPs.back();
+  if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
+      !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg())) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
+                         "definition from the main VCTP");
+    return false;
+  }
+  VCTPs.push_back(MI);
+  return true;
+}
+
 bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
   if (CannotTailPredicate)
     return false;
@@ -886,75 +965,28 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
     return false;
   }
 
-  if (isVCTP(MI)) {
-    // If we find another VCTP, check whether it uses the same value as the main VCTP.
-    // If it does, store it in the SecondaryVCTPs set, else refuse it.
-    if (VCTP) {
-      if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
-          !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) {
-        LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
-                             "definition from the main VCTP");
-        return false;
-      }
-      LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI);
-      SecondaryVCTPs.insert(MI);
-    } else {
-      LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI);
-      VCTP = MI;
-    }
-  } else if (isVPTOpcode(MI->getOpcode())) {
-    if (MI->getOpcode() != ARM::MVE_VPST) {
-      assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 &&
-             "VPT does not implicitly define VPR?!");
-      CurrentPredicate.clear();
-      CurrentPredicate.insert(MI);
-    }
-
-    VPTBlocks.emplace_back(MI, CurrentPredicate);
-    CurrentBlock = &VPTBlocks.back();
-    return true;
-  }
+  // Record all VCTPs and check that they're equivalent to one another.
+  if (isVCTP(MI) && !AddVCTP(MI))
+    return false;
 
   // Inspect uses first so that any instructions that alter the VPR don't
   // alter the predicate upon themselves.
   const MCInstrDesc &MCID = MI->getDesc();
   bool IsUse = false;
-  bool IsDef = false;
   for (int i = MI->getNumOperands() - 1; i >= 0; --i) {
     const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || MO.getReg() != ARM::VPR)
+    if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
       continue;
 
-    if (MO.isDef()) {
-      CurrentPredicate.insert(MI);
-      IsDef = true;
-    } else if (ARM::isVpred(MCID.OpInfo[i].OperandType)) {
-      CurrentBlock->addInst(MI, CurrentPredicate);
+    if (ARM::isVpred(MCID.OpInfo[i].OperandType)) {
+      VPTState::addInst(MI);
       IsUse = true;
-    } else {
+    } else if (MI->getOpcode() != ARM::MVE_VPST) {
       LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
       return false;
     }
   }
 
-  // If this instruction defines the VPR, update the predicate for the
-  // proceeding instructions.
-  if (IsDef) {
-    // Clear the existing predicate when we're not in VPT Active state.
-    if (!isVectorPredicated(MI))
-      CurrentPredicate.clear();
-    CurrentPredicate.insert(MI);
-    LLVM_DEBUG(dbgs() << "ARM Loops: Adding Predicate: " << *MI);
-  }
-
-  // If we find a vpr def that is not already predicated on the vctp, we've
-  // got disjoint predicates that may not be equivalent when we do the
-  // conversion.
-  if (IsDef && !IsUse && VCTP && !isVCTP(MI)) {
-    LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI);
-    return false;
-  }
-
   // If we find an instruction that has been marked as not valid for tail
   // predication, only allow the instruction if it's contained within a valid
   // VPT block.
@@ -968,7 +1000,26 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
 
   // If the instruction is already explicitly predicated, then the conversion
   // will be fine, but ensure that all store operations are predicated.
-  return !IsUse && MI->mayStore() ? false : true;
+  if (MI->mayStore())
+    return IsUse;
+
+  // If this instruction defines the VPR, update the predicate for the
+  // succeeding instructions.
+  if (isVectorPredicate(MI)) {
+    // Clear the existing predicate when we're not in VPT Active state,
+    // otherwise we add to it.
+    if (!isVectorPredicated(MI))
+      VPTState::resetPredicate(MI);
+    else
+      VPTState::addPredicate(MI);
+  }
+
+  // Finally once the predicate has been modified, we can start a new VPT
+  // block if necessary.
+  if (isVPTOpcode(MI->getOpcode()))
+    VPTState::CreateVPTBlock(MI);
+
+  return true;
 }
 
 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
@@ -1301,23 +1352,20 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
       llvm_unreachable("trying to unpredicate a non-predicated instruction");
   };
 
-  // There are a few scenarios which we have to fix up:
-  // 1. VPT Blocks with non-uniform predicates:
-  //    - a. When the divergent instruction is a vctp
-  //    - b. When the block uses a vpst, and is only predicated on the vctp
-  //    - c. When the block uses a vpt and (optionally) contains one or more
-  //         vctp.
-  // 2. VPT Blocks with uniform predicates:
-  //    - a. The block uses a vpst, and is only predicated on the vctp
   for (auto &Block : LoLoop.getVPTBlocks()) {
-    SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
-    if (Block.HasNonUniformPredicate()) {
-      PredicatedMI *Divergent = Block.getDivergent();
-      if (isVCTP(Divergent->MI)) {
-        // The vctp will be removed, so the block mask of the vp(s)t will need
-        // to be recomputed.
-        LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
-      } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+
+    if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/true)) {
+      if (VPTState::hasUniformPredicate(Block)) {
+        // A vpt block starting with VPST is only predicated upon vctp and has no
+        // internal vpr defs:
+        // - Remove vpst.
+        // - Unpredicate the remaining instructions.
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Insts.front());
+        LoLoop.ToRemove.insert(Insts.front());
+        for (unsigned i = 1; i < Insts.size(); ++i)
+          RemovePredicate(Insts[i]);
+      } else {
         // The VPT block has a non-uniform predicate but it uses a vpst and its
         // entry is guarded only by a vctp, which means we:
         // - Need to remove the original vpst.
@@ -1327,28 +1375,28 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
         //   the divergent vpr def.
         // TODO: We could be producing more VPT blocks than necessary and could
         // fold the newly created one into a proceeding one.
-        for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()),
-             E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
+        MachineInstr *Divergent = VPTState::getDivergent(Block);
+        for (auto I = ++MachineBasicBlock::iterator(Insts.front()),
+             E = ++MachineBasicBlock::iterator(Divergent); I != E; ++I)
           RemovePredicate(&*I);
 
         // Check if the instruction defining vpr is a vcmp so it can be combined
         // with the VPST This should be the divergent instruction
-        MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->MI->getOpcode()) != 0
-                                 ? Divergent->MI
-                                 : nullptr;
+        MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->getOpcode()) != 0
+          ? Divergent
+          : nullptr;
 
         unsigned Size = 0;
-        auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
-        auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
+        auto E = MachineBasicBlock::reverse_iterator(Divergent);
+        auto I = MachineBasicBlock::reverse_iterator(Insts.back());
         MachineInstr *InsertAt = nullptr;
         while (I != E) {
           InsertAt = &*I;
           ++Size;
           ++I;
         }
+
         MachineInstrBuilder MIB;
-        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: "
-                          << *Block.getPredicateThen());
         if (VCMP) {
           // Combine the VPST and VCMP into a VPT
           MIB =
@@ -1372,51 +1420,18 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
           MIB.addImm(0);
           LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
         }
-        LoLoop.ToRemove.insert(Block.getPredicateThen());
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Insts.front());
+        LoLoop.ToRemove.insert(Insts.front());
         LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
       }
-      // Else, if the block uses a vpt, iterate over the block, removing the
-      // extra VCTPs it may contain.
-      else if (Block.isVPT()) {
-        bool RemovedVCTP = false;
-        for (PredicatedMI &Elt : Block.getInsts()) {
-          MachineInstr *MI = Elt.MI;
-          if (isVCTP(MI)) {
-            LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *MI);
-            LoLoop.ToRemove.insert(MI);
-            RemovedVCTP = true;
-            continue;
-          }
-        }
-        if (RemovedVCTP)
-          LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
-      }
-    } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP) && Block.isVPST()) {
-      // A vpt block starting with VPST, is only predicated upon vctp and has no
-      // internal vpr defs:
-      // - Remove vpst.
-      // - Unpredicate the remaining instructions.
-      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
-      LoLoop.ToRemove.insert(Block.getPredicateThen());
-      for (auto &PredMI : Insts)
-        RemovePredicate(PredMI.MI);
-    }
-  }
-  LLVM_DEBUG(dbgs() << "ARM Loops: Removing remaining VCTPs...\n");
-  // Remove the "main" VCTP
-  LoLoop.ToRemove.insert(LoLoop.VCTP);
-  LLVM_DEBUG(dbgs() << "    " << *LoLoop.VCTP);
-  // Remove remaining secondary VCTPs
-  for (MachineInstr *VCTP : LoLoop.SecondaryVCTPs) {
-    // All VCTPs that aren't marked for removal yet should be unpredicated ones.
-    // The predicated ones should have already been marked for removal when
-    // visiting the VPT blocks.
-    if (LoLoop.ToRemove.insert(VCTP).second) {
-      assert(getVPTInstrPredicate(*VCTP) == ARMVCC::None &&
-             "Removing Predicated VCTP without updating the block mask!");
-      LLVM_DEBUG(dbgs() << "    " << *VCTP);
+    } else if (Block.containsVCTP()) {
+      // The vctp will be removed, so the block mask of the vp(s)t will need
+      // to be recomputed.
+      LoLoop.BlockMasksToRecompute.insert(Insts.front());
     }
   }
+
+  LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
 }
 
 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
index a60ad09dd360..522cce49f75a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -464,28 +464,19 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %bb4
-; CHECK-NEXT:    add.w r12, r3, #3
-; CHECK-NEXT:    mov.w lr, #1
-; CHECK-NEXT:    bic r12, r12, #3
-; CHECK-NEXT:    sub.w r12, r12, #4
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB5_2: @ %bb12
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrwt.u32 q0, [r0]
-; CHECK-NEXT:    vpttt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vcmpt.s32 le, q0, r2
-; CHECK-NEXT:    vctpt.32 r3
 ; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
 ; CHECK-NEXT:    add.w r12, r12, #4
-; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [r0], #16
-; CHECK-NEXT:    le lr, .LBB5_2
+; CHECK-NEXT:    letp lr, .LBB5_2
 ; CHECK-NEXT:  @ %bb.3: @ %bb32
 ; CHECK-NEXT:    pop {r7, pc}
 bb:

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
new file mode 100644
index 000000000000..26be5328027a
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled %s -o - | FileCheck %s
+
+define dso_local arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32* nocapture %minp) {
+; CHECK-LABEL: minmaxval4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    adr r3, .LCPI0_0
+; CHECK-NEXT:    mov.w lr, #3
+; CHECK-NEXT:    vldrw.u32 q2, [r3]
+; CHECK-NEXT:    vmov.i32 q0, #0x80000000
+; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmov.i32 q3, #0xa
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB0_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vadd.i32 q4, q2, r2
+; CHECK-NEXT:    vdup.32 q5, r2
+; CHECK-NEXT:    vcmp.u32 hi, q5, q4
+; CHECK-NEXT:    adds r2, #4
+; CHECK-NEXT:    vpnot
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcmpt.u32 hi, q3, q4
+; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q4, [r0], #16
+; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcmpt.s32 gt, q4, q0
+; CHECK-NEXT:    vpsel q0, q4, q0
+; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcmpt.s32 gt, q1, q4
+; CHECK-NEXT:    vpsel q1, q4, q1
+; CHECK-NEXT:    le lr, .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %middle.block
+; CHECK-NEXT:    mvn r0, #-2147483648
+; CHECK-NEXT:    vminv.s32 r0, q1
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    mov.w r0, #-2147483648
+; CHECK-NEXT:    vmaxv.s32 r0, q0
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI0_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 1 @ 0x1
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 3 @ 0x3
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %entry ], [ %5, %vector.body ]
+  %vec.phi29 = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %entry ], [ %7, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %x, i32 %index
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 10)
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %2 = icmp sgt <4 x i32> %wide.masked.load, %vec.phi29
+  %3 = icmp slt <4 x i32> %wide.masked.load, %vec.phi
+  %4 = and <4 x i1> %active.lane.mask, %3
+  %5 = select <4 x i1> %4, <4 x i32> %wide.masked.load, <4 x i32> %vec.phi
+  %6 = and <4 x i1> %active.lane.mask, %2
+  %7 = select <4 x i1> %6, <4 x i32> %wide.masked.load, <4 x i32> %vec.phi29
+  %index.next = add i32 %index, 4
+  %8 = icmp eq i32 %index.next, 12
+  br i1 %8, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %9 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %7)
+  %10 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %5)
+  store i32 %10, i32* %minp, align 4
+  ret i32 %9
+}
+
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
+declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) #3
+declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) #3
+

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
index 6df9702ca01d..c5da60f05ff7 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
@@ -118,32 +118,23 @@ body:             |
   ; CHECK: bb.1.bb3:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $r12 = t2ADDri renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   $vpr = VMSR_P0 killed $r3, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0)
   ; CHECK:   $r3 = tMOVr $r0, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r2
   ; CHECK: bb.2.bb9:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3
+  ; CHECK:   liveins: $lr, $r0, $r1, $r3
   ; CHECK:   renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0)
-  ; CHECK:   MVE_VPST 4, implicit $vpr
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
+  ; CHECK:   MVE_VPST 8, implicit $vpr
   ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
-  ; CHECK:   renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
-  ; CHECK:   MVE_VPST 4, implicit $vpr
-  ; CHECK:   renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr
+  ; CHECK:   MVE_VPTv4i32r 8, renamable $q0, $zr, 1, implicit-def $vpr
   ; CHECK:   renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
   ; CHECK:   renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
   ; CHECK:   MVE_VPST 8, implicit $vpr
   ; CHECK:   MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
   ; CHECK:   $r0 = tMOVr $r3, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
   ; CHECK: bb.3.bb27:
   ; CHECK:   $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg
   ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
index 74f1e0568444..c455527b36f5 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
@@ -215,26 +215,17 @@ body:             |
   ; CHECK: bb.1.vector.ph:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2
-  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-  ; CHECK:   renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
   ; CHECK: bb.2.vector.body:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
-  ; CHECK:   MVE_VPST 8, implicit $vpr
-  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
-  ; CHECK:   MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr
+  ; CHECK:   liveins: $lr, $q0, $r0, $r2, $r3
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg
+  ; CHECK:   MVE_VPTv4s32r 4, renamable $q1, renamable $r2, 11, implicit-def $vpr
   ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr
   ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
-  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
   ; CHECK: bb.3.for.cond.cleanup:
   ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
   bb.0.entry:
@@ -731,26 +722,17 @@ body:             |
   ; CHECK: bb.1.vector.ph:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2
-  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-  ; CHECK:   renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
   ; CHECK: bb.2.vector.body:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
-  ; CHECK:   MVE_VPST 8, implicit $vpr
-  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
-  ; CHECK:   MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 11, implicit-def $vpr
+  ; CHECK:   liveins: $lr, $q0, $r0, $r2, $r3
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg
+  ; CHECK:   MVE_VPTv4s32r 4, renamable $q0, renamable $r2, 11, implicit-def $vpr
   ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr
   ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
-  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
   ; CHECK: bb.3.for.cond.cleanup:
   ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
   bb.0.entry:


        

