[llvm] [AArch64] Add MATCH loops to LoopIdiomVectorizePass (PR #101976)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 2 04:43:56 PST 2025
================
@@ -939,3 +988,359 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
report_fatal_error("Loops must remain in LCSSA form!");
}
}
+
+bool LoopIdiomVectorize::recognizeFindFirstByte() {
+ // Currently the transformation only works on scalable vector types, although
+ // there is no fundamental reason why it cannot be made to work for fixed
+ // vectors too.
+ if (!TTI->supportsScalableVectors() || DisableFindFirstByte)
+ return false;
+
+ // Define some constants we need throughout.
+ BasicBlock *Header = CurLoop->getHeader();
+ LLVMContext &Ctx = Header->getContext();
+
+ // We are expecting the four blocks defined below: Header, MatchBB, InnerBB,
+ // and OuterBB. For now, we will bail out for almost anything else. The four
+ // blocks contain one nested loop.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4 ||
+ CurLoop->getSubLoops().size() != 1)
+ return false;
+
+ auto *InnerLoop = CurLoop->getSubLoops().front();
+ PHINode *IndPhi = dyn_cast<PHINode>(&Header->front());
+ if (!IndPhi || IndPhi->getNumIncomingValues() != 2)
+ return false;
+
+ // Check instruction counts.
+ auto LoopBlocks = CurLoop->getBlocks();
+ if (LoopBlocks[0]->sizeWithoutDebug() > 3 ||
+ LoopBlocks[1]->sizeWithoutDebug() > 4 ||
+ LoopBlocks[2]->sizeWithoutDebug() > 3 ||
+ LoopBlocks[3]->sizeWithoutDebug() > 3)
+ return false;
+
+ // Check that no instruction other than IndPhi has outside uses.
+ for (BasicBlock *BB : LoopBlocks)
+ for (Instruction &I : *BB)
+ if (&I != IndPhi)
+ for (User *U : I.users())
+ if (!CurLoop->contains(cast<Instruction>(U)))
+ return false;
+
+ // Match the branch instruction in the header. We are expecting an
+ // unconditional branch to the inner loop.
+ //
+ // Header:
+ // %14 = phi ptr [ %24, %OuterBB ], [ %3, %Header.preheader ]
+ // %15 = load i8, ptr %14, align 1
+ // br label %MatchBB
+ BasicBlock *MatchBB;
+ if (!match(Header->getTerminator(), m_UnconditionalBr(MatchBB)) ||
+ !InnerLoop->contains(MatchBB))
+ return false;
+
+ // MatchBB should be the entrypoint into the inner loop containing the
+ // comparison between a search element and a needle.
+ //
+ // MatchBB:
+ // %20 = phi ptr [ %7, %Header ], [ %17, %InnerBB ]
+ // %21 = load i8, ptr %20, align 1
+ // %22 = icmp eq i8 %15, %21
+ // br i1 %22, label %ExitSucc, label %InnerBB
+ BasicBlock *ExitSucc, *InnerBB;
+ Value *LoadA, *LoadB;
+ ICmpInst::Predicate MatchPred;
+ if (!match(MatchBB->getTerminator(),
+ m_Br(m_ICmp(MatchPred, m_Value(LoadA), m_Value(LoadB)),
+ m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) ||
+ MatchPred != ICmpInst::Predicate::ICMP_EQ ||
+ !InnerLoop->contains(InnerBB))
+ return false;
+
+ // We expect outside uses of `IndPhi' in ExitSucc (and only there).
+ for (User *U : IndPhi->users())
+ if (!CurLoop->contains(cast<Instruction>(U)))
+ if (auto *PN = dyn_cast<PHINode>(U); !PN || PN->getParent() != ExitSucc)
+ return false;
+
+ // Match the loads and check they are simple.
+ Value *A, *B;
+ if (!match(LoadA, m_Load(m_Value(A))) || !cast<LoadInst>(LoadA)->isSimple() ||
+ !match(LoadB, m_Load(m_Value(B))) || !cast<LoadInst>(LoadB)->isSimple())
+ return false;
+
+ // Check we are loading valid characters.
+ Type *CharTy = LoadA->getType();
+ if (!CharTy->isIntegerTy() || LoadB->getType() != CharTy)
+ return false;
+
+ // Pick the vectorisation factor based on CharTy, work out the cost of the
+ // match intrinsic and decide if we should use it.
+ // Note: For the time being we assume 128-bit vectors.
+ unsigned VF = 128 / CharTy->getIntegerBitWidth();
+ SmallVector<Type *> Args = {
+ ScalableVectorType::get(CharTy, VF), FixedVectorType::get(CharTy, VF),
+ ScalableVectorType::get(Type::getInt1Ty(Ctx), VF)};
+ IntrinsicCostAttributes Attrs(Intrinsic::experimental_vector_match, Args[2],
+ Args);
+ if (TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) > 4)
+ return false;
+
+ // The loads come from two PHIs, each with two incoming values.
+ PHINode *PNA = dyn_cast<PHINode>(A);
+ PHINode *PNB = dyn_cast<PHINode>(B);
+ if (!PNA || PNA->getNumIncomingValues() != 2 || !PNB ||
+ PNB->getNumIncomingValues() != 2)
+ return false;
+
+ // One PHI comes from the outer loop (PNA), the other one from the inner loop
+ // (PNB). PNA effectively corresponds to IndPhi.
+ if (InnerLoop->contains(PNA))
+ std::swap(PNA, PNB);
----------------
rj-jesus wrote:
Thanks, I've renamed them to Search/Needle as appropriate, as I had already done in expand/transformFindFirstByte.
https://github.com/llvm/llvm-project/pull/101976
More information about the llvm-commits mailing list