[llvm] bee2f61 - [ARM] Introduce t2WhileLoopStartTP

David Green via llvm-commits llvm-commits at lists.llvm.org
Sun Jun 13 05:55:49 PDT 2021


Author: David Green
Date: 2021-06-13T13:55:34+01:00
New Revision: bee2f618d599201aa0c91d0322f058cc697e0779

URL: https://github.com/llvm/llvm-project/commit/bee2f618d599201aa0c91d0322f058cc697e0779
DIFF: https://github.com/llvm/llvm-project/commit/bee2f618d599201aa0c91d0322f058cc697e0779.diff

LOG: [ARM] Introduce t2WhileLoopStartTP

This adds t2WhileLoopStartTP, similar to the t2DoLoopStartTP added in
D90591. It keeps a reference to both the trip count register and the
element count register, so that the ARMLowOverheadLoops pass in the
backend can pick the correct one without having to search for it via
the operand of a VCTP.

Differential Revision: https://reviews.llvm.org/D103236

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
    llvm/lib/Target/ARM/ARMBaseInstrInfo.h
    llvm/lib/Target/ARM/ARMBlockPlacement.cpp
    llvm/lib/Target/ARM/ARMInstrThumb2.td
    llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
    llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
    llvm/lib/Target/ARM/MVETailPredUtils.h
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
    llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index aff8b8e21bdd3..b55d549261506 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6122,8 +6122,9 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
   // Be conservative with ARMv8.1 MVE instructions.
   if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart ||
       Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart ||
-      Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2LoopDec ||
-      Opc == ARM::t2LoopEnd || Opc == ARM::t2LoopEndDec)
+      Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2WhileLoopStartTP ||
+      Opc == ARM::t2LoopDec || Opc == ARM::t2LoopEnd ||
+      Opc == ARM::t2LoopEndDec)
     return outliner::InstrType::Illegal;
 
   const MCInstrDesc &MCID = MI.getDesc();

diff  --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 6eb997ba1a13c..0ebba0d9fdd5e 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -367,7 +367,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
   bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override {
     return MI->getOpcode() == ARM::t2LoopEndDec ||
            MI->getOpcode() == ARM::t2DoLoopStartTP ||
-           MI->getOpcode() == ARM::t2WhileLoopStartLR;
+           MI->getOpcode() == ARM::t2WhileLoopStartLR ||
+           MI->getOpcode() == ARM::t2WhileLoopStartTP;
   }
 
 private:
@@ -645,12 +646,6 @@ static inline bool isJumpTableBranchOpcode(int Opc) {
          Opc == ARM::t2BR_JT;
 }
 
-static inline bool isLowOverheadTerminatorOpcode(int Opc) {
-  return Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart ||
-         Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2LoopEnd ||
-         Opc == ARM::t2LoopEndDec;
-}
-
 static inline
 bool isIndirectBranchOpcode(int Opc) {
   return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;

diff  --git a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
index 539db713f17ca..5ea47f529b230 100644
--- a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
+++ b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
@@ -15,6 +15,7 @@
 #include "ARMBaseInstrInfo.h"
 #include "ARMBasicBlockInfo.h"
 #include "ARMSubtarget.h"
+#include "MVETailPredUtils.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -61,13 +62,13 @@ INITIALIZE_PASS(ARMBlockPlacement, DEBUG_TYPE, "ARM block placement", false,
 
 static MachineInstr *findWLSInBlock(MachineBasicBlock *MBB) {
   for (auto &Terminator : MBB->terminators()) {
-    if (Terminator.getOpcode() == ARM::t2WhileLoopStartLR)
+    if (isWhileLoopStart(Terminator))
       return &Terminator;
   }
   return nullptr;
 }
 
-/// Find t2WhileLoopStartLR in the loop predecessor BB or otherwise in its only
+/// Find WhileLoopStart in the loop predecessor BB or otherwise in its only
 /// predecessor. If found, returns (BB, WLS Instr) pair, otherwise a null pair.
 static MachineInstr *findWLS(MachineLoop *ML) {
   MachineBasicBlock *Predecessor = ML->getLoopPredecessor();
@@ -93,7 +94,7 @@ bool ARMBlockPlacement::fixBackwardsWLS(MachineLoop *ML) {
     return false;
 
   MachineBasicBlock *Predecessor = WlsInstr->getParent();
-  MachineBasicBlock *LoopExit = WlsInstr->getOperand(2).getMBB();
+  MachineBasicBlock *LoopExit = getWhileLoopStartTargetBB(*WlsInstr);
 
   // We don't want to move Preheader to before the function's entry block.
   if (!LoopExit->getPrevNode())
@@ -118,9 +119,9 @@ bool ARMBlockPlacement::fixBackwardsWLS(MachineLoop *ML) {
        ++It) {
     MachineBasicBlock *MBB = &*It;
     for (auto &Terminator : MBB->terminators()) {
-      if (Terminator.getOpcode() != ARM::t2WhileLoopStartLR)
+      if (!isWhileLoopStart(Terminator))
         continue;
-      MachineBasicBlock *WLSTarget = Terminator.getOperand(2).getMBB();
+      MachineBasicBlock *WLSTarget = getWhileLoopStartTargetBB(Terminator);
       // TODO: Analyse the blocks to make a decision if it would be worth
       // moving Preheader even if we'd introduce a backwards WLS
       if (WLSTarget == Predecessor) {

diff  --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 5f7cfa371ff10..1258c70b81f6f 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -5479,8 +5479,8 @@ let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in {
 // t2DoLoopStart a pseudo for DLS hardware loops. Lowered into a DLS in
 // ARMLowOverheadLoops if possible, or reverted to a Mov if not.
 def t2DoLoopStart :
-  t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br,
-  [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>;
+  t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$tc), 4, IIC_Br,
+  [(set GPRlr:$X, (int_start_loop_iterations rGPR:$tc))]>;
 
 // A pseudo for a DLSTP, created in the MVETPAndVPTOptimizationPass from a
 // t2DoLoopStart if the loops is tail predicated. Holds both the element
@@ -5488,7 +5488,7 @@ def t2DoLoopStart :
 // ARMLowOverheadLoops when it is converted to a DLSTP or DLS as required.
 let isTerminator = 1, hasSideEffects = 1 in
 def t2DoLoopStartTP :
-  t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>;
+  t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$tc, rGPR:$elts), 4, IIC_Br, []>;
 
 // Setup for a t2WhileLoopStart. A pair of t2WhileLoopSetup and t2WhileLoopStart
 // will be created post-ISel from a llvm.test.start.loop.iterations. This
@@ -5496,7 +5496,7 @@ def t2DoLoopStartTP :
 // valid after reg alloc, as it should be lowered during MVETPAndVPTOptimisations
 // into a t2WhileLoopStartLR (or expanded).
 def t2WhileLoopSetup :
-  t2PseudoInst<(outs GPRlr:$lr), (ins rGPR:$elts), 4, IIC_Br, []>;
+  t2PseudoInst<(outs GPRlr:$lr), (ins rGPR:$tc), 4, IIC_Br, []>;
 
 // A pseudo to represent the decrement in a low overhead loop. A t2LoopDec and
 // t2LoopEnd together represent a LE instruction. Ideally these are converted
@@ -5511,7 +5511,7 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 1, Defs = [CPSR] in {
 // into a t2WhileLoopStartLR that does both the LR setup and branch.
 def t2WhileLoopStart :
     t2PseudoInst<(outs),
-                 (ins GPRlr:$elts, brtarget:$target),
+                 (ins GPRlr:$tc, brtarget:$target),
                  4, IIC_Br, []>,
                  Sched<[WriteBr]>;
 
@@ -5521,13 +5521,21 @@ def t2WhileLoopStart :
 // converted into t2CMP and t2Bcc.
 def t2WhileLoopStartLR :
     t2PseudoInst<(outs GPRlr:$lr),
-                 (ins rGPR:$elts, brtarget:$target),
+                 (ins rGPR:$tc, brtarget:$target),
+                 8, IIC_Br, []>,
+                 Sched<[WriteBr]>;
+
+// Similar to a t2DoLoopStartTP, a t2WhileLoopStartTP is a pseudo for a WLSTP
+// holding both the element count and the tripcount of the loop.
+def t2WhileLoopStartTP :
+    t2PseudoInst<(outs GPRlr:$lr),
+                 (ins rGPR:$tc, rGPR:$elts, brtarget:$target),
                  8, IIC_Br, []>,
                  Sched<[WriteBr]>;
 
 // t2LoopEnd - the branch half of a t2LoopDec/t2LoopEnd pair.
 def t2LoopEnd :
-  t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target),
+  t2PseudoInst<(outs), (ins GPRlr:$tc, brtarget:$target),
   8, IIC_Br, []>, Sched<[WriteBr]>;
 
 // The combination of a t2LoopDec and t2LoopEnd, performing both the LR
@@ -5535,7 +5543,7 @@ def t2LoopEnd :
 // LETP in ARMLowOverheadLoops as appropriate, or converted to t2CMP/t2Bcc
 // if the branches are out of range.
 def t2LoopEndDec :
-  t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$elts, brtarget:$target),
+  t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$tc, brtarget:$target),
   8, IIC_Br, []>, Sched<[WriteBr]>;
 
 } // end isBranch, isTerminator, hasSideEffects

diff  --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index e1d77a585d285..ecdb380cc340a 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -101,10 +101,6 @@ static bool shouldInspect(MachineInstr &MI) {
   return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
 }
 
-static bool isDo(MachineInstr *MI) {
-  return MI->getOpcode() != ARM::t2WhileLoopStartLR;
-}
-
 namespace {
 
   using InstSet = SmallPtrSetImpl<MachineInstr *>;
@@ -446,7 +442,7 @@ namespace {
     }
 
     unsigned getStartOpcode() const {
-      bool IsDo = isDo(Start);
+      bool IsDo = isDoLoopStart(*Start);
       if (!IsTailPredicationLegal())
         return IsDo ? ARM::t2DLS : ARM::t2WLS;
 
@@ -635,7 +631,8 @@ bool LowOverheadLoop::ValidateTailPredicate() {
   // elements is provided to the vctp instruction, so we need to check that
   // we can use this register at InsertPt.
   MachineInstr *VCTP = VCTPs.back();
-  if (Start->getOpcode() == ARM::t2DoLoopStartTP) {
+  if (Start->getOpcode() == ARM::t2DoLoopStartTP ||
+      Start->getOpcode() == ARM::t2WhileLoopStartTP) {
     TPNumElements = Start->getOperand(2);
     StartInsertPt = Start;
     StartInsertBB = Start->getParent();
@@ -778,10 +775,12 @@ bool LowOverheadLoop::ValidateTailPredicate() {
     }
   }
 
-  // If we converted the LoopStart to a t2DoLoopStartTP, we can also remove any
-  // extra instructions in the preheader, which often includes a now unused MOV.
-  if (Start->getOpcode() == ARM::t2DoLoopStartTP && Preheader &&
-      !Preheader->empty() &&
+  // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
+  // can also remove any extra instructions in the preheader, which often
+  // includes a now unused MOV.
+  if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||
+       Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
+      Preheader && !Preheader->empty() &&
       !RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {
     if (auto *Def = RDA.getUniqueReachingMIDef(
             &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {
@@ -1045,12 +1044,13 @@ void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
       return false;
     }
 
-    if (Start->getOpcode() == ARM::t2WhileLoopStartLR &&
-        (BBUtils->getOffsetOf(Start) >
-             BBUtils->getOffsetOf(Start->getOperand(2).getMBB()) ||
-         !BBUtils->isBBInRange(Start, Start->getOperand(2).getMBB(), 4094))) {
-      LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
-      return false;
+    if (isWhileLoopStart(*Start)) {
+      MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);
+      if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||
+          !BBUtils->isBBInRange(Start, TargetBB, 4094)) {
+        LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
+        return false;
+      }
     }
     return true;
   };
@@ -1289,7 +1289,7 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
 // another low register.
 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
-  MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
+  MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI);
   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
     ARM::tBcc : ARM::t2Bcc;
 
@@ -1426,8 +1426,8 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
 
     MIB.addDef(ARM::LR);
     MIB.add(Count);
-    if (!isDo(Start))
-      MIB.add(Start->getOperand(2));
+    if (isWhileLoopStart(*Start))
+      MIB.addMBB(getWhileLoopStartTargetBB(*Start));
 
     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
     NewStart = &*MIB;
@@ -1612,7 +1612,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
   };
 
   if (LoLoop.Revert) {
-    if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStartLR)
+    if (isWhileLoopStart(*LoLoop.Start))
       RevertWhile(LoLoop.Start);
     else
       RevertDo(LoLoop.Start);
@@ -1683,7 +1683,7 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
     Changed = true;
 
     for (auto *Start : Starts) {
-      if (Start->getOpcode() == ARM::t2WhileLoopStartLR)
+      if (isWhileLoopStart(*Start))
         RevertWhile(Start);
       else
         RevertDo(Start);

diff  --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
index 2aa5d6ad842a1..6fa5402096a6a 100644
--- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -429,7 +429,8 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
   MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
   if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
     return false;
-  if (LoopDec != LoopEnd || LoopStart->getOpcode() != ARM::t2DoLoopStart)
+  if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
+                             LoopStart->getOpcode() != ARM::t2WhileLoopStartLR))
     return false;
 
   SmallVector<MachineInstr *, 4> VCTPs;
@@ -494,12 +495,16 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
       return false;
     }
 
-  MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(),
-                                   TII->get(ARM::t2DoLoopStartTP))
-                               .add(LoopStart->getOperand(0))
-                               .add(LoopStart->getOperand(1))
-                               .addReg(CountReg);
-  (void)MI;
+  unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart
+                        ? ARM::t2DoLoopStartTP
+                        : ARM::t2WhileLoopStartTP;
+  MachineInstrBuilder MI =
+      BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc))
+          .add(LoopStart->getOperand(0))
+          .add(LoopStart->getOperand(1))
+          .addReg(CountReg);
+  if (NewOpc == ARM::t2WhileLoopStartTP)
+    MI.add(LoopStart->getOperand(2));
   LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << "  with "
                     << *MI.getInstr());
   MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);

diff  --git a/llvm/lib/Target/ARM/MVETailPredUtils.h b/llvm/lib/Target/ARM/MVETailPredUtils.h
index b0c003120fa5b..8c64893d448f1 100644
--- a/llvm/lib/Target/ARM/MVETailPredUtils.h
+++ b/llvm/lib/Target/ARM/MVETailPredUtils.h
@@ -68,11 +68,26 @@ static inline bool isVCTP(const MachineInstr *MI) {
   return false;
 }
 
-static inline bool isLoopStart(MachineInstr &MI) {
+static inline bool isDoLoopStart(const MachineInstr &MI) {
   return MI.getOpcode() == ARM::t2DoLoopStart ||
-         MI.getOpcode() == ARM::t2DoLoopStartTP ||
-         MI.getOpcode() == ARM::t2WhileLoopStart ||
-         MI.getOpcode() == ARM::t2WhileLoopStartLR;
+         MI.getOpcode() == ARM::t2DoLoopStartTP;
+}
+
+static inline bool isWhileLoopStart(const MachineInstr &MI) {
+  return MI.getOpcode() == ARM::t2WhileLoopStart ||
+         MI.getOpcode() == ARM::t2WhileLoopStartLR ||
+         MI.getOpcode() == ARM::t2WhileLoopStartTP;
+}
+
+static inline bool isLoopStart(const MachineInstr &MI) {
+  return isDoLoopStart(MI) || isWhileLoopStart(MI);
+}
+
+// Return the TargetBB stored in a t2WhileLoopStartLR/t2WhileLoopStartTP.
+inline MachineBasicBlock *getWhileLoopStartTargetBB(const MachineInstr &MI) {
+  assert(isWhileLoopStart(MI) && "Expected WhileLoopStart!");
+  unsigned Op = MI.getOpcode() == ARM::t2WhileLoopStartTP ? 3 : 2;
+  return MI.getOperand(Op).getMBB();
 }
 
 // WhileLoopStart holds the exit block, so produce a subs Op0, Op1, 0 and then a
@@ -84,8 +99,9 @@ inline void RevertWhileLoopStartLR(MachineInstr *MI, const TargetInstrInfo *TII,
                                    unsigned BrOpc = ARM::t2Bcc,
                                    bool UseCmp = false) {
   MachineBasicBlock *MBB = MI->getParent();
-  assert(MI->getOpcode() == ARM::t2WhileLoopStartLR &&
-         "Only expected a t2WhileLoopStartLR in RevertWhileLoopStartLR!");
+  assert((MI->getOpcode() == ARM::t2WhileLoopStartLR ||
+          MI->getOpcode() == ARM::t2WhileLoopStartTP) &&
+         "Only expected a t2WhileLoopStartLR/TP in RevertWhileLoopStartLR!");
 
   // Subs/Cmp
   if (UseCmp) {
@@ -109,8 +125,8 @@ inline void RevertWhileLoopStartLR(MachineInstr *MI, const TargetInstrInfo *TII,
   // Branch
   MachineInstrBuilder MIB =
       BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
-  MIB.add(MI->getOperand(2)); // branch target
-  MIB.addImm(ARMCC::EQ);      // condition code
+  MIB.addMBB(getWhileLoopStartTargetBB(*MI)); // branch target
+  MIB.addImm(ARMCC::EQ);                      // condition code
   MIB.addReg(ARM::CPSR);
 
   MI->eraseFromParent();

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
index 8c8f67844257b..040e026e6a80a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -17,8 +17,7 @@ define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n,
 ; CHECK-NEXT:    @ Child Loop BB0_4 Depth 2
 ; CHECK-NEXT:    adds r4, r1, r7
 ; CHECK-NEXT:    adds r5, r0, r7
-; CHECK-NEXT:    mov r6, r3
-; CHECK-NEXT:    wlstp.8 lr, r6, .LBB0_3
+; CHECK-NEXT:    wlstp.8 lr, r3, .LBB0_3
 ; CHECK-NEXT:    b .LBB0_4
 ; CHECK-NEXT:  .LBB0_3: @ %for.body
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
@@ -71,8 +70,7 @@ define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) {
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB1_4 Depth 2
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    wlstp.8 lr, r3, .LBB1_3
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB1_3
 ; CHECK-NEXT:    b .LBB1_4
 ; CHECK-NEXT:  .LBB1_3: @ %for.body
 ; CHECK-NEXT:    @ in Loop: Header=BB1_2 Depth=1
@@ -285,8 +283,7 @@ define void @test_memset_preheader(i8* %x, i8* %y, i32 %n) {
 ; CHECK-NEXT:  @ %bb.1: @ %prehead
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    wlstp.8 lr, r3, .LBB6_3
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB6_3
 ; CHECK-NEXT:  .LBB6_2: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vstrb.8 q0, [r12], #16
 ; CHECK-NEXT:    letp lr, .LBB6_2

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir
index 234b112050d47..e94af93d8cfe9 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir
@@ -63,11 +63,11 @@ body:             |
   ; CHECK:   [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   [[t2WhileLoopStartLR:%[0-9]+]]:gprlr = t2WhileLoopStartLR killed [[t2LSRri]], %bb.3, implicit-def $cpsr
+  ; CHECK:   [[t2WhileLoopStartTP:%[0-9]+]]:gprlr = t2WhileLoopStartTP killed [[t2LSRri]], [[COPY]], %bb.3, implicit-def $cpsr
   ; CHECK: bb.2:
   ; CHECK:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; CHECK:   [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %11, %bb.2
-  ; CHECK:   [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopStartLR]], %bb.1, %13, %bb.2
+  ; CHECK:   [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopStartTP]], %bb.1, %13, %bb.2
   ; CHECK:   [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %15, %bb.2
   ; CHECK:   [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg
   ; CHECK:   [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index 21ce978ee49f2..56421bde7b605 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -634,8 +634,7 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r3, r0, r5, lsl #1
-; CHECK-NEXT:    mov r5, r6
-; CHECK-NEXT:    wlstp.8 lr, r5, .LBB10_4
+; CHECK-NEXT:    wlstp.8 lr, r6, .LBB10_4
 ; CHECK-NEXT:    b .LBB10_15
 ; CHECK-NEXT:  .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
 ; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
index 8929e082864fd..ea1f75bbc7067 100644
--- a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
@@ -235,8 +235,7 @@ define void @test11(i8* nocapture %x, i8* nocapture %y, i32 %n) {
 ; CHECK-NEXT:  .LBB10_1: @ %prehead
 ; CHECK-NEXT:    mov r12, r1
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    wlstp.8 lr, r3, .LBB10_3
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB10_3
 ; CHECK-NEXT:  .LBB10_2: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u8 q0, [r12], #16
 ; CHECK-NEXT:    vstrb.8 q0, [r4], #16
@@ -318,8 +317,7 @@ define void @twoloops(i32* %X, i32 %n, i32 %m) {
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    mov r3, r0
-; CHECK-NEXT:    mov r1, r2
-; CHECK-NEXT:    wlstp.8 lr, r1, .LBB13_2
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB13_2
 ; CHECK-NEXT:  .LBB13_1: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vstrb.8 q0, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB13_1
@@ -489,8 +487,7 @@ define void @multilooped_exit(i32 %b) {
 ; CHECK-NEXT:    movt r3, :upper16:arr_56
 ; CHECK-NEXT:    lsr.w r12, r1, #4
 ; CHECK-NEXT:    mov r2, r3
-; CHECK-NEXT:    mov r1, r0
-; CHECK-NEXT:    wlstp.8 lr, r1, .LBB18_5
+; CHECK-NEXT:    wlstp.8 lr, r0, .LBB18_5
 ; CHECK-NEXT:  .LBB18_4: @ Parent Loop BB18_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
@@ -498,8 +495,7 @@ define void @multilooped_exit(i32 %b) {
 ; CHECK-NEXT:  .LBB18_5: @ %loop
 ; CHECK-NEXT:    @ in Loop: Header=BB18_3 Depth=1
 ; CHECK-NEXT:    mov r2, r3
-; CHECK-NEXT:    mov r1, r0
-; CHECK-NEXT:    wlstp.8 lr, r1, .LBB18_7
+; CHECK-NEXT:    wlstp.8 lr, r0, .LBB18_7
 ; CHECK-NEXT:  .LBB18_6: @ Parent Loop BB18_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
@@ -507,8 +503,7 @@ define void @multilooped_exit(i32 %b) {
 ; CHECK-NEXT:  .LBB18_7: @ %loop
 ; CHECK-NEXT:    @ in Loop: Header=BB18_3 Depth=1
 ; CHECK-NEXT:    mov r2, r3
-; CHECK-NEXT:    mov r1, r0
-; CHECK-NEXT:    wlstp.8 lr, r1, .LBB18_9
+; CHECK-NEXT:    wlstp.8 lr, r0, .LBB18_9
 ; CHECK-NEXT:  .LBB18_8: @ Parent Loop BB18_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
@@ -567,12 +562,10 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    movw r0, :lower16:arr_22
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    movt r0, :upper16:arr_22
-; CHECK-NEXT:    str r2, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    add.w r1, r2, #15
 ; CHECK-NEXT:    lsrs r3, r1, #4
-; CHECK-NEXT:    mov r1, r2
-; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
-; CHECK-NEXT:    wlstp.8 lr, r1, .LBB19_2
+; CHECK-NEXT:    strd r3, r2, [sp] @ 8-byte Folded Spill
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB19_2
 ; CHECK-NEXT:  .LBB19_1: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vstrb.8 q0, [r0], #16
 ; CHECK-NEXT:    letp lr, .LBB19_1
@@ -621,11 +614,12 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    le lr, .LBB19_3
 ; CHECK-NEXT:  @ %bb.4: @ %for.cond.cleanup6
 ; CHECK-NEXT:    movw r0, :lower16:arr_22
-; CHECK-NEXT:    ldrd r2, r1, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    movt r0, :upper16:arr_22
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    ldr r3, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r0, r0, #1824
-; CHECK-NEXT:    wlstp.8 lr, r1, .LBB19_6
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB19_6
 ; CHECK-NEXT:  .LBB19_5: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vstrb.8 q1, [r0], #16
 ; CHECK-NEXT:    letp lr, .LBB19_5
@@ -675,11 +669,12 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    le lr, .LBB19_7
 ; CHECK-NEXT:  @ %bb.8: @ %for.cond.cleanup6.1
 ; CHECK-NEXT:    movw r0, :lower16:arr_22
-; CHECK-NEXT:    ldrd r2, r1, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    movt r0, :upper16:arr_22
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    ldr r3, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r0, r0, #3648
-; CHECK-NEXT:    wlstp.8 lr, r1, .LBB19_10
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB19_10
 ; CHECK-NEXT:  .LBB19_9: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vstrb.8 q1, [r0], #16
 ; CHECK-NEXT:    letp lr, .LBB19_9
@@ -731,19 +726,14 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    le lr, .LBB19_11
 ; CHECK-NEXT:  @ %bb.12: @ %for.cond.cleanup6.2
 ; CHECK-NEXT:    movw r0, :lower16:arr_22
-; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r2, r1, [sp] @ 8-byte Folded Reload
 ; CHECK-NEXT:    movt r0, :upper16:arr_22
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    add.w r0, r0, #5472
-; CHECK-NEXT:    wls lr, r1, .LBB19_14
+; CHECK-NEXT:    wlstp.8 lr, r1, .LBB19_14
 ; CHECK-NEXT:  .LBB19_13: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vctp.8 r1
-; CHECK-NEXT:    subs r1, #16
-; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vstrbt.8 q1, [r0], #16
-; CHECK-NEXT:    le lr, .LBB19_13
+; CHECK-NEXT:    vstrb.8 q1, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB19_13
 ; CHECK-NEXT:  .LBB19_14: @ %for.cond.cleanup6.2
 ; CHECK-NEXT:    movw r2, :lower16:arr_21
 ; CHECK-NEXT:    movw r1, #5508


        


More information about the llvm-commits mailing list