[llvm] 0efc9e5 - [ARM][MVE] More MVETailPredication debug messages. NFC.
Sjoerd Meijer via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 6 01:58:37 PST 2020
Author: Sjoerd Meijer
Date: 2020-01-06T09:56:02Z
New Revision: 0efc9e5a8cc12b9cb30adf2a3dbb14ffbc60e338
URL: https://github.com/llvm/llvm-project/commit/0efc9e5a8cc12b9cb30adf2a3dbb14ffbc60e338
DIFF: https://github.com/llvm/llvm-project/commit/0efc9e5a8cc12b9cb30adf2a3dbb14ffbc60e338.diff
LOG: [ARM][MVE] More MVETailPredication debug messages. NFC.
I've added a few more debug messages to MVETailPredication because I wanted to
trace better which instructions are added/removed. And while I was at it, I
factored out one function which I thought was clearer, and have added some
comments to describe better the flow between MVETailPredication and
ARMLowOverheadLoops.
Differential Revision: https://reviews.llvm.org/D71549
Added:
Modified:
llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
llvm/lib/Target/ARM/MVETailPredication.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 136f7d7e8de0..31a98d86a54d 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -15,6 +15,10 @@
/// - t2LoopDec - placed within in the loop body.
/// - t2LoopEnd - the loop latch terminator.
///
+/// In addition to this, we also look for the presence of the VCTP instruction,
+/// which determines whether we can generate the tail-predicated low-overhead
+/// loop form.
+///
//===----------------------------------------------------------------------===//
#include "ARM.h"
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 24bbc6236a4e..038c68739cdf 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -20,6 +20,11 @@
/// - A tail-predicated loop, with implicit predication.
/// - A loop containing multiple VCPT instructions, predicating multiple VPT
/// blocks of instructions operating on different vector types.
+///
+/// This pass inserts the VCTP intrinsic to represent the effect of
+/// tail predication. This will be picked up by the ARM Low-overhead loop pass,
+/// which performs the final transformation to a DLSTP or WLSTP tail-predicated
+/// loop.
#include "ARM.h"
#include "ARMSubtarget.h"
@@ -86,6 +91,12 @@ class MVETailPredication : public LoopPass {
/// Is the icmp that generates an i1 vector, based upon a loop counter
/// and a limit that is defined outside the loop.
bool isTailPredicate(Instruction *Predicate, Value *NumElements);
+
+ /// Insert the intrinsic to represent the effect of tail predication.
+ void InsertVCTPIntrinsic(Instruction *Predicate,
+ DenseMap<Instruction*, Instruction*> &NewPredicates,
+ VectorType *VecTy,
+ Value *NumElements);
};
} // end namespace
@@ -124,7 +135,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
// The MVE and LOB extensions are combined to enable tail-predication, but
// there's nothing preventing us from generating VCTP instructions for v8.1m.
if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) {
- LLVM_DEBUG(dbgs() << "TP: Not a v8.1m.main+mve target.\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Not a v8.1m.main+mve target.\n");
return false;
}
@@ -149,7 +160,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
// Look for the hardware loop intrinsic that sets the iteration count.
IntrinsicInst *Setup = FindLoopIterations(Preheader);
- // The test.set iteration could live in the pre- preheader.
+ // The test.set iteration could live in the pre-preheader.
if (!Setup) {
if (!Preheader->getSinglePredecessor())
return false;
@@ -172,11 +183,9 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
if (!Decrement)
return false;
- LLVM_DEBUG(dbgs() << "TP: Running on Loop: " << *L
- << *Setup << "\n"
+ LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
<< *Decrement << "\n");
- bool Changed = TryConvert(Setup->getArgOperand(0));
- return Changed;
+ return TryConvert(Setup->getArgOperand(0));
}
bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
@@ -235,7 +244,7 @@ bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
return false;
// Now back to searching inside the loop body...
- // Find the add with takes the index iv and adds a constant vector to it.
+ // Find the add with takes the index iv and adds a constant vector to it.
Instruction *BroadcastSplat = nullptr;
Constant *Const = nullptr;
if (!match(Induction, m_Add(m_Instruction(BroadcastSplat),
@@ -270,14 +279,14 @@ bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader());
if (!match(OnEntry, m_Zero()))
return false;
-
+
Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch());
unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements();
Instruction *LHS = nullptr;
if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes))))
return false;
-
+
return LHS == Phi;
}
@@ -299,7 +308,7 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
unsigned ElementWidth = VecTy->getScalarSizeInBits();
// MVE vectors are 128-bit, but don't support 128 x i1.
// TODO: Can we support vectors larger than 128-bits?
- unsigned MaxWidth = TTI->getRegisterBitWidth(true);
+ unsigned MaxWidth = TTI->getRegisterBitWidth(true);
if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
return false;
MaskedInsts.push_back(cast<IntrinsicInst>(&I));
@@ -400,19 +409,25 @@ Value* MVETailPredication::ComputeElements(Value *TripCount,
// tail predicated loop.
static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
SetVector<Instruction*> &MaybeDead, Loop *L) {
- if (BasicBlock *Exit = L->getUniqueExitBlock()) {
- for (auto &Pair : NewPredicates) {
- Instruction *OldPred = Pair.first;
- Instruction *NewPred = Pair.second;
-
- for (auto &I : *Exit) {
- if (I.isSameOperationAs(OldPred)) {
- Instruction *PredClone = NewPred->clone();
- PredClone->insertBefore(&I);
- I.replaceAllUsesWith(PredClone);
- MaybeDead.insert(&I);
- break;
- }
+ BasicBlock *Exit = L->getUniqueExitBlock();
+ if (!Exit) {
+ LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
+ return;
+ }
+
+ for (auto &Pair : NewPredicates) {
+ Instruction *OldPred = Pair.first;
+ Instruction *NewPred = Pair.second;
+
+ for (auto &I : *Exit) {
+ if (I.isSameOperationAs(OldPred)) {
+ Instruction *PredClone = NewPred->clone();
+ PredClone->insertBefore(&I);
+ I.replaceAllUsesWith(PredClone);
+ MaybeDead.insert(&I);
+ LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
+ dbgs() << "ARM TP: with: "; PredClone->dump());
+ break;
}
}
}
@@ -433,23 +448,69 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
Dead.insert(I);
}
- for (auto *I : Dead)
+ for (auto *I : Dead) {
+ LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump());
I->eraseFromParent();
+ }
for (auto I : L->blocks())
DeleteDeadPHIs(I);
}
+void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
+ DenseMap<Instruction*, Instruction*> &NewPredicates,
+ VectorType *VecTy, Value *NumElements) {
+ IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
+ Module *M = L->getHeader()->getModule();
+ Type *Ty = IntegerType::get(M->getContext(), 32);
+
+ // Insert a phi to count the number of elements processed by the loop.
+ PHINode *Processed = Builder.CreatePHI(Ty, 2);
+ Processed->addIncoming(NumElements, L->getLoopPreheader());
+
+ // Insert the intrinsic to represent the effect of tail predication.
+ Builder.SetInsertPoint(cast<Instruction>(Predicate));
+ ConstantInt *Factor =
+ ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
+
+ Intrinsic::ID VCTPID;
+ switch (VecTy->getNumElements()) {
+ default:
+ llvm_unreachable("unexpected number of lanes");
+ case 4: VCTPID = Intrinsic::arm_mve_vctp32; break;
+ case 8: VCTPID = Intrinsic::arm_mve_vctp16; break;
+ case 16: VCTPID = Intrinsic::arm_mve_vctp8; break;
+
+ // FIXME: vctp64 currently not supported because the predicate
+ // vector wants to be <2 x i1>, but v2i1 is not a legal MVE
+ // type, so problems happen at isel time.
+ // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics
+ // purposes, but takes a v4i1 instead of a v2i1.
+ }
+ Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
+ Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
+ Predicate->replaceAllUsesWith(TailPredicate);
+ NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
+
+ // Add the incoming value to the new phi.
+ // TODO: This add likely already exists in the loop.
+ Value *Remaining = Builder.CreateSub(Processed, Factor);
+ Processed->addIncoming(Remaining, L->getLoopLatch());
+ LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: "
+ << *Processed << "\n"
+ << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n");
+}
+
bool MVETailPredication::TryConvert(Value *TripCount) {
- if (!IsPredicatedVectorLoop())
+ if (!IsPredicatedVectorLoop()) {
+ LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop");
return false;
+ }
- LLVM_DEBUG(dbgs() << "TP: Found predicated vector loop.\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
// Walk through the masked intrinsics and try to find whether the predicate
// operand is generated from an induction variable.
- Module *M = L->getHeader()->getModule();
- Type *Ty = IntegerType::get(M->getContext(), 32);
SetVector<Instruction*> Predicates;
DenseMap<Instruction*, Instruction*> NewPredicates;
@@ -466,48 +527,14 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
continue;
if (!isTailPredicate(Predicate, NumElements)) {
- LLVM_DEBUG(dbgs() << "TP: Not tail predicate: " << *Predicate << "\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Not tail predicate: " << *Predicate << "\n");
continue;
}
- LLVM_DEBUG(dbgs() << "TP: Found tail predicate: " << *Predicate << "\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Found tail predicate: " << *Predicate << "\n");
Predicates.insert(Predicate);
- // Insert a phi to count the number of elements processed by the loop.
- IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
- PHINode *Processed = Builder.CreatePHI(Ty, 2);
- Processed->addIncoming(NumElements, L->getLoopPreheader());
-
- // Insert the intrinsic to represent the effect of tail predication.
- Builder.SetInsertPoint(cast<Instruction>(Predicate));
- ConstantInt *Factor =
- ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
- Intrinsic::ID VCTPID;
- switch (VecTy->getNumElements()) {
- default:
- llvm_unreachable("unexpected number of lanes");
- case 4: VCTPID = Intrinsic::arm_mve_vctp32; break;
- case 8: VCTPID = Intrinsic::arm_mve_vctp16; break;
- case 16: VCTPID = Intrinsic::arm_mve_vctp8; break;
-
- // FIXME: vctp64 currently not supported because the predicate
- // vector wants to be <2 x i1>, but v2i1 is not a legal MVE
- // type, so problems happen at isel time.
- // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics
- // purposes, but takes a v4i1 instead of a v2i1.
- }
- Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
- Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
- Predicate->replaceAllUsesWith(TailPredicate);
- NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
-
- // Add the incoming value to the new phi.
- // TODO: This add likely already exists in the loop.
- Value *Remaining = Builder.CreateSub(Processed, Factor);
- Processed->addIncoming(Remaining, L->getLoopLatch());
- LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: "
- << *Processed << "\n"
- << "TP: Inserted VCTP: " << *TailPredicate << "\n");
+ InsertVCTPIntrinsic(Predicate, NewPredicates, VecTy, NumElements);
}
// Now clean up.
More information about the llvm-commits
mailing list