[llvm] af45907 - [ARM][MVE] Tail-predication: clean-up of unused code

Tue Jun 30 09:12:04 PDT 2020

Author: Sjoerd Meijer
Date: 2020-06-30T17:09:36+01:00
New Revision: af45907653fd312264632b616eff0fad1ae1eb2e

URL: https://github.com/llvm/llvm-project/commit/af45907653fd312264632b616eff0fad1ae1eb2e
DIFF: https://github.com/llvm/llvm-project/commit/af45907653fd312264632b616eff0fad1ae1eb2e.diff

LOG: [ARM][MVE] Tail-predication: clean-up of unused code

After the rewrite of this pass (D79175) I missed one thing: the inserted VCTP
intrinsic can be cloned to exit blocks if there are instructions present in it
that perform the same operation, but this wasn't triggering anymore. However,
it turns out that for handling reductions, see D75533, it's actually easier
not to have the VCTP in exit blocks, so this removes that code.

This was possible because some other code that depended on this -
rematerialization of the trip count, which enabled more dead code removal
later - wasn't doing much anymore, due to the more aggressive dead code
removal that was added to the low-overhead loops pass.

Differential Revision: https://reviews.llvm.org/D82773

Added: 
    

Modified: 
    llvm/lib/Target/ARM/MVETailPredication.cpp
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 311b6efed3e3..42a659bdf4bd 100644

--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -79,14 +79,9 @@ namespace {
 class MVETailPredication : public LoopPass {
   SmallVector<IntrinsicInst*, 4> MaskedInsts;
   Loop *L = nullptr;
-  LoopInfo *LI = nullptr;
-  const DataLayout *DL;
-  DominatorTree *DT = nullptr;
   ScalarEvolution *SE = nullptr;
   TargetTransformInfo *TTI = nullptr;
   const ARMSubtarget *ST = nullptr;
-  TargetLibraryInfo *TLI = nullptr;
-  bool ClonedVCTPInExitBlock = false;
 
 public:
   static char ID;
@@ -98,8 +93,6 @@ class MVETailPredication : public LoopPass {
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<TargetPassConfig>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
-    AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.setPreservesCFG();
   }
@@ -123,8 +116,7 @@ class MVETailPredication : public LoopPass {
 
   /// Insert the intrinsic to represent the effect of tail predication.
   void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
-                           FixedVectorType *VecTy,
-                           DenseMap<Instruction *, Instruction *> &NewPredicates);
+                           FixedVectorType *VecTy);
 
   /// Rematerialize the iteration count in exit blocks, which enables
   /// ARMLowOverheadLoops to better optimise away loop update statements inside
@@ -153,16 +145,6 @@ static bool IsMasked(Instruction *I) {
   return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
 }
 
-void MVETailPredication::RematerializeIterCount() {
-  SmallVector<WeakTrackingVH, 16> DeadInsts;
-  SCEVExpander Rewriter(*SE, *DL, "mvetp");
-  ReplaceExitVal ReplaceExitValue = AlwaysRepl;
-
-  formLCSSARecursively(*L, *DT, LI, SE);
-  rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT, ReplaceExitValue,
-                        DeadInsts);
-}
-
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   if (skipLoop(L) || DisableTailPredication)
     return false;
@@ -172,13 +154,8 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   auto &TPC = getAnalysis<TargetPassConfig>();
   auto &TM = TPC.getTM<TargetMachine>();
   ST = &TM.getSubtarget<ARMSubtarget>(F);
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
-  TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
-  DL = &L->getHeader()->getModule()->getDataLayout();
   this->L = L;
 
   // The MVE and LOB extensions are combined to enable tail-predication, but
@@ -232,7 +209,6 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   if (!Decrement)
     return false;
 
-  ClonedVCTPInExitBlock = false;
   LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
              << *Decrement << "\n");
 
@@ -241,8 +217,6 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
     return false;
   }
 
-  if (ClonedVCTPInExitBlock)
-    RematerializeIterCount();
   return true;
 }
 
@@ -319,32 +293,11 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
 // in the block. This means that the VPR doesn't have to be live into the
 // exit block which should make it easier to convert this loop into a proper
 // tail predicated loop.
-static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
-                    SetVector<Instruction*> &MaybeDead, Loop *L) {
+static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
   BasicBlock *Exit = L->getUniqueExitBlock();
   if (!Exit) {
     LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
-    return false;
-  }
-
-  bool ClonedVCTPInExitBlock = false;
-
-  for (auto &Pair : NewPredicates) {
-    Instruction *OldPred = Pair.first;
-    Instruction *NewPred = Pair.second;
-
-    for (auto &I : *Exit) {
-      if (I.isSameOperationAs(OldPred)) {
-        Instruction *PredClone = NewPred->clone();
-        PredClone->insertBefore(&I);
-        I.replaceAllUsesWith(PredClone);
-        MaybeDead.insert(&I);
-        ClonedVCTPInExitBlock = true;
-        LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
-                   dbgs() << "ARM TP: with:      "; PredClone->dump());
-        break;
-      }
-    }
+    return;
   }
 
   // Drop references and add operands to check for dead.
@@ -369,8 +322,6 @@ static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
 
   for (auto I : L->blocks())
     DeleteDeadPHIs(I);
-
-  return ClonedVCTPInExitBlock;
 }
 
 // The active lane intrinsic has this form:
@@ -549,8 +500,7 @@ static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
 }
 
 void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
-    Value *TripCount, FixedVectorType *VecTy,
-    DenseMap<Instruction*, Instruction*> &NewPredicates) {
+    Value *TripCount, FixedVectorType *VecTy) {
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);
@@ -591,7 +541,6 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
   Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
   Value *VCTPCall = Builder.CreateCall(VCTP, Processed);
   ActiveLaneMask->replaceAllUsesWith(VCTPCall);
-  NewPredicates[ActiveLaneMask] = cast<Instruction>(VCTPCall);
 
   // Add the incoming value to the new phi.
   // TODO: This add likely already exists in the loop.
@@ -609,9 +558,7 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
   }
 
   LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
-
   SetVector<Instruction*> Predicates;
-  DenseMap<Instruction*, Instruction*> NewPredicates;
 
   // Walk through the masked intrinsics and try to find whether the predicate
   // operand is generated by intrinsic @llvm.get.active.lane.mask().
@@ -636,11 +583,10 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
       return false;
     }
     LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
-    InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy, NewPredicates);
+    InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy);
   }
 
-  // Now clean up.
-  ClonedVCTPInExitBlock = Cleanup(NewPredicates, Predicates, L);
+  Cleanup(Predicates, L);
   return true;
 }
 

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
index 1d766f378fc5..aaeae75e072f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -15,8 +15,7 @@
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
 
 ; CHECK: middle.block:
-; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER:%.*]])
-; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
+; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP]],
 ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
 
 define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
@@ -64,16 +63,12 @@ vector.body:                                      ; preds = %vector.body, %vecto
   br i1 %11, label %vector.body, label %middle.block
 
 middle.block:                                     ; preds = %vector.body
-; TODO: check that the intrinsic is also emitted here by the loop vectoriser
-;  %12 = icmp ule <4 x i32> %induction, %broadcast.splat12
-  %12 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
-
-  %13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi
-  %14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13)
+  %12 = select <4 x i1> %7, <4 x i32> %9, <4 x i32> %vec.phi
+  %13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12)
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %middle.block, %entry
-  %res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ]
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %13, %middle.block ]
   ret i32 %res.0.lcssa
 }