[llvm] [LoopIdiomVectorize][NFC] Factoring out the part that handles vectorization strategy (PR #94682)

Min-Yih Hsu via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 10 10:35:55 PDT 2024


https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/94682

>From 31c6fad204a249a81059bc460202fc9e7798b84b Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 6 Jun 2024 11:57:04 -0700
Subject: [PATCH 1/2] [LoopIdiomVectorize] Remove redundant DomTreeUpdates

Because of how we insert most of our vector code between the original
preheader and a block splitted out from it, we actually don't need most
of the DTU updates as an edge deletion update is sufficient to update
the DT of the said region.

This is effectively a NFC.
---
 .../Vectorize/LoopIdiomVectorize.cpp          | 52 ++++++-------------
 1 file changed, 17 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 38095b1433ebe..f52a32fee7401 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -40,6 +40,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -391,6 +392,14 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   BasicBlock *LoopIncBlock = BasicBlock::Create(
       Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock);
 
+  // This is actually one of the only two DTU updates we need. The reason being
+  // that we're splitting `mismatch_end` out of the preheader and put
+  // most of the stuff we create later between the preheader and
+  // `mismatch_end`. Now when DTU removes an edge, it simply recalculates
+  // everything in between. In this case, it will be the prehedaer and
+  // `mismatch_end`, along with the aforementioned content. Therefore we don't
+  // need to insert additional DTU updates for new control flow edges
+  // added in this region.
   DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock},
                     {DominatorTree::Delete, Preheader, EndBlock}});
 
@@ -436,10 +445,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
       MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1));
   Builder.Insert(MinItCheckBr);
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock},
-       {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}});
-
   // For each of the arrays, check the start/end addresses are on the same
   // page.
   Builder.SetInsertPoint(MemCheckBlock);
@@ -482,10 +487,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
                                 .createBranchWeights(10, 90));
   Builder.Insert(CombinedPageCmpCmpBr);
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock},
-       {DominatorTree::Insert, MemCheckBlock, VectorLoopPreheaderBlock}});
-
   // Set up the vector loop preheader, i.e. calculate initial loop predicate,
   // zero-extend MaxLen to 64-bits, determine the number of vector elements
   // processed in each iteration, etc.
@@ -512,9 +513,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
   Builder.Insert(JumpToVectorLoop);
 
-  DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
-                     VectorLoopStartBlock}});
-
   // Set up the first vector loop block by creating the PHIs, doing the vector
   // loads and comparing the vectors.
   Builder.SetInsertPoint(VectorLoopStartBlock);
@@ -542,10 +540,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
       VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
   Builder.Insert(VectorEarlyExit);
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
-       {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
-
   // Increment the index counter and calculate the predicate for the next
   // iteration of the loop. We branch back to the start of the loop if there
   // is at least one active lane.
@@ -565,10 +559,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
       BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
   Builder.Insert(VectorLoopBranchBack);
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
-       {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
-
   // If we found a mismatch then we need to calculate which lane in the vector
   // had a mismatch and add that on to the current loop index.
   Builder.SetInsertPoint(VectorLoopMismatchBlock);
@@ -592,16 +582,10 @@ Value *LoopIdiomVectorize::expandFindMismatch(
 
   Builder.Insert(BranchInst::Create(EndBlock));
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, VectorLoopMismatchBlock, EndBlock}});
-
   // Generate code for scalar loop.
   Builder.SetInsertPoint(LoopPreHeaderBlock);
   Builder.Insert(BranchInst::Create(LoopStartBlock));
 
-  DTU.applyUpdates(
-      {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}});
-
   Builder.SetInsertPoint(LoopStartBlock);
   PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index");
   IndexPhi->addIncoming(Start, LoopPreHeaderBlock);
@@ -623,9 +607,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp);
   Builder.Insert(MatchCmpBr);
 
-  DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock},
-                    {DominatorTree::Insert, LoopStartBlock, EndBlock}});
-
   // Have we reached the maximum permitted length for the loop?
   Builder.SetInsertPoint(LoopIncBlock);
   Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "",
@@ -636,9 +617,6 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp);
   Builder.Insert(IVCmpBr);
 
-  DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock},
-                    {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}});
-
   // In the end block we need to insert a PHI node to deal with three cases:
   //  1. We didn't find a mismatch in the scalar loop, so we return MaxLen.
   //  2. We exitted the scalar loop early due to a mismatch and need to return
@@ -679,7 +657,12 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
   BasicBlock *Header = CurLoop->getHeader();
   BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
   IRBuilder<> Builder(PHBranch);
+
+  // Safeguard to check if we build the correct DomTree with DTU.
+  auto CheckDTU = llvm::make_scope_exit(
+      [this]() { assert(DT->verify() && "Ill-formed DomTree built by DTU"); });
   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
   Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
 
   // Increment the pointer if this was done before the loads in the loop.
@@ -708,6 +691,9 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
   Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header);
   PHBranch->eraseFromParent();
 
+  // Previously we take care of the DTU updates between the preheader and
+  // `mismatch_end`. Now we need to make sure edges and blocks appended after
+  // `mismatch_end` are also being properly accounted for.
   BasicBlock *MismatchEnd = cast<Instruction>(ByteCmpRes)->getParent();
   DTU.applyUpdates({{DominatorTree::Insert, MismatchEnd, CmpBB}});
 
@@ -717,12 +703,8 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
   if (FoundBB != EndBB) {
     Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen);
     Builder.CreateCondBr(FoundCmp, EndBB, FoundBB);
-    DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB},
-                      {DominatorTree::Insert, CmpBB, EndBB}});
-
   } else {
     Builder.CreateBr(FoundBB);
-    DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}});
   }
 
   auto fixSuccessorPhis = [&](BasicBlock *SuccBB) {

>From 174d02ecc00cdf7e0226e37bcf5aa8619f01f435 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 6 Jun 2024 13:20:55 -0700
Subject: [PATCH 2/2] [LoopIdiomVectorize][NFC] Factoring out the part that
 handles vectorization strategy

To pave the way for porting LIV to RISC-V, which uses VP intrinsics for
vectors.

NFC.
---
 .../Vectorize/LoopIdiomVectorize.cpp          | 219 ++++++++++--------
 1 file changed, 123 insertions(+), 96 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index f52a32fee7401..6b6b067db30b7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -79,6 +79,13 @@ class LoopIdiomVectorize {
   const TargetTransformInfo *TTI;
   const DataLayout *DL;
 
+  // Blocks that will be used for inserting vectorized code.
+  BasicBlock *EndBlock = nullptr;
+  BasicBlock *VectorLoopPreheaderBlock = nullptr;
+  BasicBlock *VectorLoopStartBlock = nullptr;
+  BasicBlock *VectorLoopMismatchBlock = nullptr;
+  BasicBlock *VectorLoopIncBlock = nullptr;
+
 public:
   explicit LoopIdiomVectorize(DominatorTree *DT, LoopInfo *LI,
                               const TargetTransformInfo *TTI,
@@ -96,9 +103,15 @@ class LoopIdiomVectorize {
                       SmallVectorImpl<BasicBlock *> &ExitBlocks);
 
   bool recognizeByteCompare();
+
   Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
                             GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                             Instruction *Index, Value *Start, Value *MaxLen);
+
+  Value *createMaskedFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+                                  GetElementPtrInst *GEPB, Value *ExtStart,
+                                  Value *ExtEnd);
+
   void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                             PHINode *IndPhi, Value *MaxLen, Instruction *Index,
                             Value *Start, bool IncIdx, BasicBlock *FoundBB,
@@ -332,6 +345,106 @@ bool LoopIdiomVectorize::recognizeByteCompare() {
   return true;
 }
 
+Value *LoopIdiomVectorize::createMaskedFindMismatch(IRBuilder<> &Builder,
+                                                    GetElementPtrInst *GEPA,
+                                                    GetElementPtrInst *GEPB,
+                                                    Value *ExtStart,
+                                                    Value *ExtEnd) {
+  Type *I64Type = Builder.getInt64Ty();
+  Type *ResType = Builder.getInt32Ty();
+  Type *LoadType = Builder.getInt8Ty();
+  Value *PtrA = GEPA->getPointerOperand();
+  Value *PtrB = GEPB->getPointerOperand();
+
+  // At this point we know two things must be true:
+  //  1. Start <= End
+  //  2. ExtMaxLen <= MinPageSize due to the page checks.
+  // Therefore, we know that we can use a 64-bit induction variable that
+  // starts from 0 -> ExtMaxLen and it will not overflow.
+  ScalableVectorType *PredVTy =
+      ScalableVectorType::get(Builder.getInt1Ty(), 16);
+
+  Value *InitialPred = Builder.CreateIntrinsic(
+      Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
+
+  Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
+  VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
+                             /*HasNUW=*/true, /*HasNSW=*/true);
+
+  Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
+                                            Builder.getInt1(false));
+
+  BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
+  Builder.Insert(JumpToVectorLoop);
+
+  // Set up the first vector loop block by creating the PHIs, doing the vector
+  // loads and comparing the vectors.
+  Builder.SetInsertPoint(VectorLoopStartBlock);
+  PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred");
+  LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
+  PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
+  VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
+  Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
+  Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
+
+  Value *VectorLhsGep =
+      Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds());
+  Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep,
+                                                  Align(1), LoopPred, Passthru);
+
+  Value *VectorRhsGep =
+      Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds());
+  Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep,
+                                                  Align(1), LoopPred, Passthru);
+
+  Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
+  VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
+  Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
+  BranchInst *VectorEarlyExit = BranchInst::Create(
+      VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
+  Builder.Insert(VectorEarlyExit);
+
+  // Increment the index counter and calculate the predicate for the next
+  // iteration of the loop. We branch back to the start of the loop if there
+  // is at least one active lane.
+  Builder.SetInsertPoint(VectorLoopIncBlock);
+  Value *NewVectorIndexPhi =
+      Builder.CreateAdd(VectorIndexPhi, VecLen, "",
+                        /*HasNUW=*/true, /*HasNSW=*/true);
+  VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
+  Value *NewPred =
+      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
+                              {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd});
+  LoopPred->addIncoming(NewPred, VectorLoopIncBlock);
+
+  Value *PredHasActiveLanes =
+      Builder.CreateExtractElement(NewPred, uint64_t(0));
+  BranchInst *VectorLoopBranchBack =
+      BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
+  Builder.Insert(VectorLoopBranchBack);
+
+  // If we found a mismatch then we need to calculate which lane in the vector
+  // had a mismatch and add that on to the current loop index.
+  Builder.SetInsertPoint(VectorLoopMismatchBlock);
+  PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred");
+  FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock);
+  PHINode *LastLoopPred =
+      Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred");
+  LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock);
+  PHINode *VectorFoundIndex =
+      Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index");
+  VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
+
+  Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
+  Value *Ctz = Builder.CreateIntrinsic(
+      Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()},
+      {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)});
+  Ctz = Builder.CreateZExt(Ctz, I64Type);
+  Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "",
+                                             /*HasNUW=*/true, /*HasNSW=*/true);
+  return Builder.CreateTrunc(VectorLoopRes64, ResType);
+}
+
 Value *LoopIdiomVectorize::expandFindMismatch(
     IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
     GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
@@ -346,8 +459,7 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   Type *ResType = Builder.getInt32Ty();
 
   // Split block in the original loop preheader.
-  BasicBlock *EndBlock =
-      SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
+  EndBlock = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
 
   // Create the blocks that we're going to need:
   //  1. A block for checking the zero-extended length exceeds 0
@@ -371,17 +483,17 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   BasicBlock *MemCheckBlock = BasicBlock::Create(
       Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopPreheaderBlock = BasicBlock::Create(
+  VectorLoopPreheaderBlock = BasicBlock::Create(
       Ctx, "mismatch_vec_loop_preheader", EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopStartBlock = BasicBlock::Create(
-      Ctx, "mismatch_vec_loop", EndBlock->getParent(), EndBlock);
+  VectorLoopStartBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop",
+                                            EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopIncBlock = BasicBlock::Create(
-      Ctx, "mismatch_vec_loop_inc", EndBlock->getParent(), EndBlock);
+  VectorLoopIncBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_inc",
+                                          EndBlock->getParent(), EndBlock);
 
-  BasicBlock *VectorLoopMismatchBlock = BasicBlock::Create(
-      Ctx, "mismatch_vec_loop_found", EndBlock->getParent(), EndBlock);
+  VectorLoopMismatchBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_found",
+                                               EndBlock->getParent(), EndBlock);
 
   BasicBlock *LoopPreHeaderBlock = BasicBlock::Create(
       Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock);
@@ -492,93 +604,8 @@ Value *LoopIdiomVectorize::expandFindMismatch(
   // processed in each iteration, etc.
   Builder.SetInsertPoint(VectorLoopPreheaderBlock);
 
-  // At this point we know two things must be true:
-  //  1. Start <= End
-  //  2. ExtMaxLen <= MinPageSize due to the page checks.
-  // Therefore, we know that we can use a 64-bit induction variable that
-  // starts from 0 -> ExtMaxLen and it will not overflow.
-  ScalableVectorType *PredVTy =
-      ScalableVectorType::get(Builder.getInt1Ty(), 16);
-
-  Value *InitialPred = Builder.CreateIntrinsic(
-      Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
-
-  Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
-  VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
-                             /*HasNUW=*/true, /*HasNSW=*/true);
-
-  Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
-                                            Builder.getInt1(false));
-
-  BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
-  Builder.Insert(JumpToVectorLoop);
-
-  // Set up the first vector loop block by creating the PHIs, doing the vector
-  // loads and comparing the vectors.
-  Builder.SetInsertPoint(VectorLoopStartBlock);
-  PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred");
-  LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
-  PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
-  VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
-  Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
-  Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
-
-  Value *VectorLhsGep =
-      Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds());
-  Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep,
-                                                  Align(1), LoopPred, Passthru);
-
-  Value *VectorRhsGep =
-      Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds());
-  Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep,
-                                                  Align(1), LoopPred, Passthru);
-
-  Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
-  VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
-  Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
-  BranchInst *VectorEarlyExit = BranchInst::Create(
-      VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
-  Builder.Insert(VectorEarlyExit);
-
-  // Increment the index counter and calculate the predicate for the next
-  // iteration of the loop. We branch back to the start of the loop if there
-  // is at least one active lane.
-  Builder.SetInsertPoint(VectorLoopIncBlock);
-  Value *NewVectorIndexPhi =
-      Builder.CreateAdd(VectorIndexPhi, VecLen, "",
-                        /*HasNUW=*/true, /*HasNSW=*/true);
-  VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
-  Value *NewPred =
-      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
-                              {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd});
-  LoopPred->addIncoming(NewPred, VectorLoopIncBlock);
-
-  Value *PredHasActiveLanes =
-      Builder.CreateExtractElement(NewPred, uint64_t(0));
-  BranchInst *VectorLoopBranchBack =
-      BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
-  Builder.Insert(VectorLoopBranchBack);
-
-  // If we found a mismatch then we need to calculate which lane in the vector
-  // had a mismatch and add that on to the current loop index.
-  Builder.SetInsertPoint(VectorLoopMismatchBlock);
-  PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred");
-  FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock);
-  PHINode *LastLoopPred =
-      Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred");
-  LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock);
-  PHINode *VectorFoundIndex =
-      Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index");
-  VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
-
-  Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
-  Value *Ctz = Builder.CreateIntrinsic(
-      Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()},
-      {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)});
-  Ctz = Builder.CreateZExt(Ctz, I64Type);
-  Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "",
-                                             /*HasNUW=*/true, /*HasNSW=*/true);
-  Value *VectorLoopRes = Builder.CreateTrunc(VectorLoopRes64, ResType);
+  Value *VectorLoopRes =
+      createMaskedFindMismatch(Builder, GEPA, GEPB, ExtStart, ExtEnd);
 
   Builder.Insert(BranchInst::Create(EndBlock));
 



More information about the llvm-commits mailing list