[llvm] [WIP][LoopIdiomVectorize] Recognize and transform minidx pattern (PR #144987)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 19 23:22:40 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Madhur Amilkanthwar (madhur13490)

<details>
<summary>Changes</summary>

This patch vectorizes the case where the array is scanned backwards and the index of the first minimum element (minidx) is returned. The motivating example comes from the rnflow FORTRAN benchmark.

Pre-commit test can be found as part of #<!-- -->141556

---

Patch is 62.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144987.diff


2 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp (+714) 
- (added) llvm/test/Transforms/LoopVectorize/last-min-index-ftn.ll (+291) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 491f0b76f4ae0..afb6f6aea4d59 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -70,10 +70,12 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cstdint>
 
 using namespace llvm;
 using namespace PatternMatch;
@@ -99,6 +101,11 @@ static cl::opt<bool>
                    cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
                             "not convert byte-compare loop(s)."));
 
+// Escape hatch: run the Loop Idiom Vectorize pass but skip recognition of
+// minloc/maxloc loops (analogous to the byte-compare disable flag above).
+static cl::opt<bool> DisableMinMaxlocPattern(
+    "disable-loop-idiom-vectorize-minmaxloc", cl::Hidden, cl::init(false),
+    cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
+             "not convert minloc/maxloc loop(s)."));
+
 static cl::opt<unsigned>
     ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden,
               cl::desc("The vectorization factor for byte-compare patterns."),
@@ -149,6 +156,13 @@ class LoopIdiomVectorize {
 
   bool recognizeByteCompare();
 
+  bool recognizeMinIdxPattern();
+
+  bool transformMinIdxPattern(unsigned VF, Value *FirstIndex,
+                              Value *SecondIndex, BasicBlock *LoopPreheader,
+                              Value *BasePtr, BasicBlock *Header,
+                              BasicBlock *ExitBB, Type *LoadType);
+
   Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
                             GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                             Instruction *Index, Value *Start, Value *MaxLen);
@@ -239,9 +253,709 @@ bool LoopIdiomVectorize::run(Loop *L) {
   if (recognizeFindFirstByte())
     return true;
 
+  if (recognizeMinIdxPattern())
+    return true;
+
   return false;
 }
 
+// Recognize a backward-scan "first minimum index" (minidx) loop and, if all
+// structural checks pass, hand the extracted pieces (bounds, base pointer,
+// blocks, element type) to transformMinIdxPattern(). Returns true iff the
+// loop was transformed.
+bool LoopIdiomVectorize::recognizeMinIdxPattern() {
+  BasicBlock *Header = CurLoop->getHeader();
+  Function *F = Header->getParent();
+  BasicBlock *LoopPreheader = CurLoop->getLoopPreheader();
+
+  if (!TTI->supportsScalableVectors() || DisableMinMaxlocPattern) {
+    LLVM_DEBUG(dbgs() << "Does not meet pre-requisites for minidx idiom\n");
+    return false;
+  }
+
+  // getLoopPreheader() may return null for loops without a dedicated
+  // preheader; everything below dereferences it, so bail out early.
+  if (!LoopPreheader) {
+    LLVM_DEBUG(dbgs() << "Loop does not have a dedicated preheader\n");
+    return false;
+  }
+
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) {
+    LLVM_DEBUG(dbgs() << "Loop must consist of a single block with a single "
+                         "backedge for the minidx pattern\n");
+    return false;
+  }
+
+  // Heuristic lower bound: the matched pattern (two loads plus address
+  // computation, fcmp, select, trunc, IV update, compare and branch) needs at
+  // least this many instructions, so smaller headers cannot contain it.
+  if (Header->sizeWithoutDebug() < 14) {
+    LLVM_DEBUG(dbgs() << "Header block is too small for minidx pattern\n");
+    return false;
+  }
+
+  // We need the below things to be able to transform the pattern:
+  // 1. First index. For this we look at the terminator instruction of
+  // the predecessor of the loop preheader. The condition of the terminator
+  // instruction decides whether to jump to the scalar loop.
+  // 2. Second index.
+  // 3. Base pointer.
+  // For 2 and 3, we iterate backward from the header block to find the select
+  // instruction. The select instruction should be of the form select (fcmp
+  // contract olt loadA, loadB). Further details below. Once we find the
+  // required pattern, we can extract the base pointer from the first load
+  // instruction.
+  // 4. Exit basic block. For this we look at the terminator instruction of the
+  // header block.
+
+  // Extract the first index from the preheader.
+  // Example LLVM IR to inspect:
+  // %4 = load i32, ptr %1, align 4
+  // %5 = load i32, ptr %0, align 4
+  // %6 = sext i32 %5 to i64
+  // %7 = sub i32 0, %4
+  // %8 = sext i32 %7 to i64
+  // %9 = add nsw i64 %8, %6
+  // %10 = sub nsw i64 0, %9
+  // %invariant.gep = ...
+  // %invariant.gep1 = ...
+  // %11 = icmp slt i64 %9, 0
+  // br i1 %11, label %.loop_preheader, ...
+  Value *ICmpSLTFirstVal = nullptr, *FirstIndex = nullptr;
+  BasicBlock *LoopPreheaderBB = nullptr, *RetBB = nullptr;
+  BasicBlock *PreheaderPred = LoopPreheader->getSinglePredecessor();
+  // getSinglePredecessor() returns null when the preheader has zero or
+  // multiple predecessors; match() must not be handed a null terminator.
+  if (!PreheaderPred) {
+    LLVM_DEBUG(dbgs() << "Preheader does not have a single predecessor\n");
+    return false;
+  }
+  if (!match(PreheaderPred->getTerminator(),
+             m_Br(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Value(ICmpSLTFirstVal),
+                                 m_ZeroInt()),
+                  m_BasicBlock(LoopPreheaderBB), m_BasicBlock(RetBB)))) {
+    LLVM_DEBUG(dbgs() << "Terminator doesn't match expected pattern\n");
+    return false;
+  }
+
+  // The Add operand can be either below:
+  // 1. add(sext(sub(0 - ipos2)), sext(ipos1))
+  // 2. add(sext(ipos1), sext(sub(0 - ipos2)))
+  // This depends on whether canonicalization has been done or not.
+  // A successful m_Add match guarantees ICmpSLTFirstVal is an Instruction,
+  // so cast<> (rather than an unchecked dyn_cast<> dereference) is safe.
+  if (match(ICmpSLTFirstVal, m_Add(m_SExt(m_Sub(m_ZeroInt(), m_Value())),
+                                   m_SExt(m_Value())))) {
+    FirstIndex = cast<Instruction>(ICmpSLTFirstVal)->getOperand(1);
+  } else if (match(ICmpSLTFirstVal,
+                   m_Add(m_SExt(m_Value()),
+                         m_SExt(m_Sub(m_ZeroInt(), m_Value()))))) {
+    FirstIndex = cast<Instruction>(ICmpSLTFirstVal)->getOperand(0);
+  } else {
+    LLVM_DEBUG(dbgs() << "Cannot extract FirstIndex from ICmpSLTFirstVal\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "FirstIndex is " << *FirstIndex << "\n");
+
+  BasicBlock::reverse_iterator RI = Header->rbegin();
+  SelectInst *SelectToInspect = nullptr;
+  Value *BasePtr = nullptr;
+  Instruction *Trunc = nullptr;
+
+  // Iterate in backward direction to extract the select instruction which
+  // matches the pattern:
+
+  // %load1_gep = getelementptr float, ptr %invariant.gep, i64 %indvars.iv
+  // %load1 = load float, ptr %load1_gep, align 4
+  // %load2_gep = getelementptr float, ptr ..., ...
+  // %load2 = load float, ptr %load2_gep, align 4
+  // %trunc = trunc nsw i64 %indvars.iv.next to i32
+  // %fcmp = fcmp contract olt float %load1, %load2
+  // %select = select i1 %fcmp, i32 %trunc, i32 <phi>
+  // %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  while (RI != Header->rend()) {
+    if (auto *Sel = dyn_cast<SelectInst>(&*RI)) {
+      // Bind the base pointer through a temporary: m_Value() binds eagerly
+      // during partial matches, so a later select whose condition only
+      // partially matches (first load matches, rest fails) must not clobber
+      // the BasePtr belonging to the select we keep.
+      Value *CandidateBase = nullptr;
+      if (match(Sel,
+                m_Select(m_SpecificFCmp(
+                             FCmpInst::FCMP_OLT,
+                             m_Load(m_GEP(m_Value(CandidateBase), m_Value())),
+                             m_Load(m_GEP(m_Value(), m_Value()))),
+                         m_Instruction(Trunc), m_Value()))) {
+        SelectToInspect = Sel;
+        BasePtr = CandidateBase;
+      }
+    }
+    ++RI;
+  }
+  if (!SelectToInspect || !BasePtr) {
+    LLVM_DEBUG(dbgs() << "Select or BasePtr not found\n");
+    return false;
+  }
+
+  // Extract the FCmp and validate that both compared values are loads of the
+  // same type.
+  auto *FCmp = dyn_cast<FCmpInst>(SelectToInspect->getCondition());
+  if (!FCmp || !isa<LoadInst>(FCmp->getOperand(0)) ||
+      !isa<LoadInst>(FCmp->getOperand(1)))
+    return false;
+
+  auto *LoadA = cast<LoadInst>(FCmp->getOperand(0));
+  auto *LoadB = cast<LoadInst>(FCmp->getOperand(1));
+
+  if (LoadA->getType() != LoadB->getType()) {
+    LLVM_DEBUG(dbgs() << "Load types don't match\n");
+    return false;
+  }
+
+  // Validate the truncation instruction: the select's true value must
+  // truncate the decremented IV to the function's return type.
+  TruncInst *TInst = dyn_cast<TruncInst>(Trunc);
+  if (!TInst || TInst->getDestTy() != F->getReturnType()) {
+    LLVM_DEBUG(dbgs() << "Trunc instruction validation failed\n");
+    return false;
+  }
+  // Trunc instruction's operand should be of the form (add IVPHI, -1).
+  Instruction *IVInst = nullptr;
+  if (!match(TInst->getOperand(0),
+             m_Add(m_Instruction(IVInst), m_SpecificInt(-1)))) {
+    LLVM_DEBUG(
+        dbgs() << "Trunc instruction operand doesn't match expected pattern\n");
+    return false;
+  }
+
+  PHINode *IVPhi = dyn_cast<PHINode>(IVInst);
+  if (!IVPhi) {
+    LLVM_DEBUG(dbgs() << "Add operand of trunc instruction is not a PHINode\n");
+    return false;
+  }
+
+  // The IV PHI's incoming value from the preheader is the scan start index.
+  Value *SecondIndex = IVPhi->getIncomingValueForBlock(LoopPreheader);
+  LLVM_DEBUG(dbgs() << "SecondIndex is " << *SecondIndex << "\n");
+
+  // 4. Inspect the terminator to extract the exit block.
+  // Example LLVM IR to inspect:
+  //   %20 = icmp sgt i64 %13, 1
+  //   br i1 %20, label %.lr.ph, label %._crit_edge.loopexit
+  Value *ICmpFirstVal = nullptr;
+  BasicBlock *FalseBB = nullptr;
+  // Match directly on the terminator: m_Br itself rejects non-branch
+  // terminators, whereas a separate dyn_cast<BranchInst> could hand a null
+  // pointer to match().
+  if (!match(Header->getTerminator(),
+             m_Br(m_SpecificICmp(ICmpInst::ICMP_SGT, m_Value(ICmpFirstVal),
+                                 m_One()),
+                  m_BasicBlock(Header), m_BasicBlock(FalseBB)))) {
+    LLVM_DEBUG(dbgs() << "Terminator doesn't match expected pattern\n");
+    return false;
+  }
+
+  // Derive the vectorization factor from a 128-bit granule. Guard against
+  // element types that are wider than the granule or have no primitive size,
+  // which would otherwise yield VF == 0.
+  unsigned EltBits = LoadA->getType()->getPrimitiveSizeInBits();
+  if (EltBits == 0 || EltBits > 128) {
+    LLVM_DEBUG(dbgs() << "Unsupported element type for minidx pattern\n");
+    return false;
+  }
+  unsigned VF = 128 / EltBits;
+
+  // We've recognized the pattern, now transform it.
+  LLVM_DEBUG(dbgs() << "FOUND MINIDX PATTERN\n");
+
+  return transformMinIdxPattern(VF, FirstIndex, SecondIndex, LoopPreheader,
+                                BasePtr, Header, FalseBB, LoadA->getType());
+}
+
+bool LoopIdiomVectorize::transformMinIdxPattern(
+    unsigned VF, Value *FirstIndex, Value *SecondIndex,
+    BasicBlock *LoopPreheader, Value *BasePtr, BasicBlock *Header,
+    BasicBlock *ExitBB, Type *LoadType) {
+
+  LLVMContext &Ctx = Header->getContext();
+  Function *F = Header->getParent();
+  Module *M = F->getParent();
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+  Type *I32Ty = Type::getInt32Ty(Ctx);
+  Type *I64Ty = Type::getInt64Ty(Ctx);
+  Type *I1Ty = Type::getInt1Ty(Ctx);
+  Type *PointerType = PointerType::get(Ctx, 0);
+  auto *MaskTy = ScalableVectorType::get(Type::getInt1Ty(Ctx), 4);
+  auto *VecTy = ScalableVectorType::get(
+      LoadType, VF); // This is the vector type for i32 values
+
+  BasicBlock *VecEntry = BasicBlock::Create(Ctx, "minidx.vec.entry", F);
+  BasicBlock *MinIdxPartial1If =
+      BasicBlock::Create(Ctx, "minidx.partial.1.if", F);
+  BasicBlock *MinIdxPartial1ProcExit =
+      BasicBlock::Create(Ctx, "minidx.partial.1.proc.exit", F);
+  BasicBlock *MinIdxWhileBodyLrPh =
+      BasicBlock::Create(Ctx, "minidx.while.body.ph", F);
+  BasicBlock *MinIdxVectBody = BasicBlock::Create(Ctx, "minidx.vect.body", F);
+  BasicBlock *MinIdxVectUpdate =
+      BasicBlock::Create(Ctx, "minidx.vect.update", F);
+  BasicBlock *MinIdxVectContinue =
+      BasicBlock::Create(Ctx, "minidx.vect.continue", F);
+  BasicBlock *MinIdxVectEnd = BasicBlock::Create(Ctx, "minidx.vect.end", F);
+  BasicBlock *MinIdxPartial2If =
+      BasicBlock::Create(Ctx, "minidx.partial.2.if", F);
+  BasicBlock *MinIdxPartial2Exit =
+      BasicBlock::Create(Ctx, "minidx.partial.2.exit", F);
+  BasicBlock *MinIdxEnd = BasicBlock::Create(Ctx, "minidx.end", F);
+
+  Loop *VecLoop = LI->AllocateLoop();
+  VecLoop->addBasicBlockToLoop(MinIdxVectBody, *LI);
+  VecLoop->addBasicBlockToLoop(MinIdxVectUpdate, *LI);
+  VecLoop->addBasicBlockToLoop(MinIdxVectContinue, *LI);
+
+  LI->addTopLevelLoop(VecLoop);
+
+  // Start populating preheader.
+  IRBuilder<> Builder(LoopPreheader->getTerminator());
+  // %VScale = tail call i64 @llvm.vscale.i64()
+  // %VLen = shl nuw nsw i64 %VScale, 2
+  // %minidx.not = sub nsw i64 0, %VLen
+  // %minidx.and = and i64 %ipos2, %minidx.not
+  Value *GMax = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
+                                          ConstantFP::getInfinity(LoadType, 0),
+                                          "minloc.gmax");
+  Value *VScale = Builder.CreateVScale(I64Ty);
+  Value *VLen =
+      Builder.CreateShl(VScale, ConstantInt::get(I64Ty, 2), "minidx.vlen");
+  Value *Not =
+      Builder.CreateSub(ConstantInt::get(I64Ty, 0), VLen, "minidx.not");
+  // Value *Ipos2Minus1 = Builder.CreateSub(IncomingPos2,
+  // ConstantInt::get(I64Ty, 1), "minidx.ipos2.minus1");
+  Value *And = Builder.CreateAnd(SecondIndex, Not, "minidx.and");
+
+  // %minidx.umax = tail call i64 @llvm.umax.i64(i64 %minidx.and, i64 %ipos1)
+  // %minidx.add = add i64 %ipos2, 1
+  Value *Umax = Builder.CreateIntrinsic(
+      Intrinsic::smax, {I64Ty}, {And, FirstIndex}, nullptr, "minidx.umax");
+  Value *Add =
+      Builder.CreateAdd(SecondIndex, ConstantInt::get(I64Ty, 1), "minidx.add");
+  // %minidx.mask = call <vscale x 4 x i1>
+  // @llvm.get.active.lane.mask.nxv4i1.i64(i64 %minidx.umax, i64 %minidx.add)
+  Value *MinlocMask = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::get_active_lane_mask,
+                                        {MaskTy, I64Ty}),
+      {Umax, Add}, "minidx.mask");
+
+  // %minidx.add.ptr.i = getelementptr inbounds nuw float, ptr %p, i64
+  // %minidx.umax %minidx.masked.load = tail call <vscale x 4 x float>
+  // @llvm.masked.load.nxv4f32.p0(ptr %minidx.add.ptr.i, i32 1, <vscale x 4 x
+  // i1> %minidx.mask, <vscale x 4 x float> zeroinitializer) %minidx.currentVals
+  // = select <vscale x 4 x i1> %minidx.mask, <vscale x 4 x float>
+  // %minidx.masked.load, <vscale x 4 x float> splat (float 0x7FF0000000000000)
+  // %minidx.reverse = tail call <vscale x 4 x i1>
+  // @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %minidx.mask)
+  // %minidx.reverseVals = tail call <vscale x 4 x float>
+  // @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %minidx.currentVals)
+  // %minidx.minVal = call float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x
+  // 4 x float> %minidx.reverseVals)
+
+  Value *UmaxMinus1 =
+      Builder.CreateSub(Umax, ConstantInt::get(I64Ty, 1), "minidx.umax.minus1");
+  Value *AddPtrI = Builder.CreateInBoundsGEP(LoadType, BasePtr, UmaxMinus1,
+                                             "minidx.add.ptr.i");
+
+  Value *LoadVals =
+      Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+                             M, Intrinsic::masked_load, {VecTy, PointerType}),
+                         {AddPtrI, ConstantInt::get(I32Ty, 1), MinlocMask,
+                          Constant::getNullValue(VecTy)},
+                         "minidx.loadVals");
+  Value *CurrentVals =
+      Builder.CreateSelect(MinlocMask, LoadVals, GMax, "minidx.currentVals");
+  Value *Reverse = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {MaskTy}),
+      {MinlocMask}, "minidx.reverse");
+  Value *ReverseVals = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {VecTy}),
+      {CurrentVals}, "minidx.reverseVals");
+  Value *MinVal =
+      Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+                             M, Intrinsic::vector_reduce_fminimum, {VecTy}),
+                         {ReverseVals}, "minidx.minVal");
+
+  Builder.CreateCondBr(Builder.getTrue(), VecEntry, Header);
+  LoopPreheader->getTerminator()->eraseFromParent();
+
+  // Add edge from preheader to VecEntry
+  DTU.applyUpdates({{DominatorTree::Insert, LoopPreheader, VecEntry}});
+
+  // %minidx.entry.cmp = fcmp olt float %minidx.minVal, %init
+  // br i1 %minidx.entry.cmp, label %minidx.partial.1.if, label
+  // %minidx.partial.1.proc.exit
+  Builder.SetInsertPoint(VecEntry);
+  Value *VecEntryCmp = Builder.CreateFCmpOLT(
+      MinVal, ConstantFP::getInfinity(LoadType, 0), "minidx.entry.cmp");
+  Builder.CreateCondBr(VecEntryCmp, MinIdxPartial1If, MinIdxPartial1ProcExit);
+
+  // Connect edges from VecEntry to MinIdxPartial1If and MinIdxPartial1ProcExit
+  DTU.applyUpdates({{DominatorTree::Insert, VecEntry, MinIdxPartial1If},
+                    {DominatorTree::Insert, VecEntry, MinIdxPartial1ProcExit}});
+
+  Builder.SetInsertPoint(MinIdxPartial1If);
+  // %minVal.splatinsert = insertelement <vscale x 4 x float> poison, float
+  // %minidx.minVal, i64 0 %minVal.splat = shufflevector <vscale x 4 x float>
+  // %minVal.splatinsert, <vscale x 4 x float> poison, <vscale x 4 x i32>
+  // zeroinitializer
+  Value *MinValSplat = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
+                                                 MinVal, "minval.splat");
+  // %minidx.partial.1.cmp = fcmp oeq <vscale x 4 x float> %minidx.reverseVals,
+  // %minVal.splat %minidx.partial.1.and = and <vscale x 4 x i1>
+  // %minidx.reverse, %minidx.partial.1.cmp %minidx.partial.1.cttz = tail call
+  // i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1>
+  // %minidx.partial.1.and, i1 true)
+  Value *FirstPartialCmp =
+      Builder.CreateFCmpOEQ(ReverseVals, MinValSplat, "minidx.partial.1.cmp");
+  Value *FirstPartialAnd =
+      Builder.CreateAnd(Reverse, FirstPartialCmp, "minidx.partial.1.and");
+  Value *FirstPartialCTTZ = Builder.CreateCountTrailingZeroElems(
+      I64Ty, FirstPartialAnd, ConstantInt::get(I1Ty, 1),
+      "minidx.partial.1.cttz");
+
+  // FIXME this pattern
+  // %minidx.partial.1.xor = xor i64 %minidx.partial.1.cttz, -1
+  // %minidx.partial.1.add1 = add i64 %minidx.umax, %VLen
+  // %minidx.partial.1.add2 = add i64 %minidx.partial.1.add1,
+  // %minidx.partial.1.xor br label %minidx.partial.1.proc.exit
+  Value *FirstPartialTmp1 =
+      Builder.CreateSub(VLen, FirstPartialCTTZ, "minidx.partial.1.tmp");
+  Value *FirstPartialTmp =
+      Builder.CreateSub(FirstPartialTmp1, ConstantInt::get(I64Ty, 1),
+                        "minidx.partial.1.tmp.minus1");
+  Value *FirstPartialAdd2 =
+      Builder.CreateAdd(Umax, FirstPartialTmp, "minidx.partial.1.add2");
+
+  Builder.CreateBr(MinIdxPartial1ProcExit);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxPartial1If, MinIdxPartial1ProcExit}});
+
+  Builder.SetInsertPoint(MinIdxPartial1ProcExit);
+  // %minidx.partial.1.exit.known_min = phi float [ %minidx.minVal,
+  // %minidx.partial.1.if ], [ %init, %entry ] %partial1.exit.known_arg = phi
+  // i64 [ %minidx.partial.1.add2, %minidx.partial.1.if ], [ 0, %entry ]
+  PHINode *Partial1ExitKnownMin =
+      Builder.CreatePHI(LoadType, 2, "minidx.partial.1.exit.known_min");
+  PHINode *Partial1ExitKnownArg =
+      Builder.CreatePHI(I64Ty, 2, "partial1.exit.known_arg");
+
+  Partial1ExitKnownMin->addIncoming(MinVal, MinIdxPartial1If);
+  Partial1ExitKnownMin->addIncoming(ConstantFP::getInfinity(LoadType, 0),
+                                    VecEntry);
+  Partial1ExitKnownArg->addIncoming(FirstPartialAdd2, MinIdxPartial1If);
+  Partial1ExitKnownArg->addIncoming(ConstantInt::get(I64Ty, 0), VecEntry);
+
+  // %minidx.partial.1.proc.exit.add = add i64 %VLen, %ipos1
+  // %minidx.partial.1.proc.exit.icmp = icmp ult i64 %minidx.umax,
+  // %minidx.partial.1.proc.exit.add br i1 %minidx.partial.1.proc.exit.icmp,
+  // label %minidx.vect.end, label %minidx.while.body.ph
+  Value *MinIdxPartial1ProcExitAdd =
+      Builder.CreateAdd(VLen, FirstIndex, "minidx.partial.1.proc.exit.add");
+  Value *MinIdxPartial1ProcExitICmp = Builder.CreateICmpULT(
+      Umax, MinIdxPartial1ProcExitAdd, "minidx.partial.1.proc.exit.icmp");
+  Builder.CreateCondBr(MinIdxPartial1ProcExitICmp, MinIdxVectEnd,
+                       MinIdxWhileBodyLrPh);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxPartial1ProcExit, MinIdxVectEnd},
+       {DominatorTree::Insert, MinIdxPartial1ProcExit, MinIdxWhileBodyLrPh}});
+
+  Builder.SetInsertPoint(MinIdxWhileBodyLrPh);
+  // %minidx.while.body.ph.mul = mul nsw i64 %VScale, -16
+  // %minidx.while.body.ph.gep = getelementptr i8, ptr %p, i64
+  // %minidx.while.body.ph.mul br label %minidx.vect.body
+  Builder.CreateBr(MinIdxVectBody);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxWhileBodyLrPh, MinIdxVectBody}});
+
+  Builder.SetInsertPoint(MinIdxVectBody);
+  // %minidx.vect.body.phi1 = phi i64 [ %minidx.umax, %minidx.while.body.ph ], [
+  // %minidx.vect.body.sub, %minidx.vect.continue ] %minidx.vect.body.known_arg
+  // = phi i64 [ %partial1.exit.known_arg, %minidx.while.body.ph ], [
+  // %minidx.vect.continue.known_arg, %minidx.vect.continue ]
+  // %minidx.vect.body.known_min = phi float [ %minidx.partial.1.exit.known_min,
+  // %minidx.while.body.ph ], [ %minidx.vect.continue.known_min,
+  // %minidx.vect.continue ]
+  PHINode *MinIdxVectBodyPhi1 =
+      Builder.CreatePHI(I64Ty, 2, "minidx.vect.body.phi1");
+  PHINode *MinIdxVectBodyKnownArg =
+      Builder.CreatePHI(I64Ty, 2, "minidx.vect.body.known_arg");
+  PHINode *MinIdxVectBodyKnownMin =
+      Builder.CreatePHI(LoadType, 2, "minidx.vect.body.known_min");
+
+  // %minidx.vect.body.sub = sub i64 %minidx.vect.body.phi1, %VLen
+  // %minidx.vect.body.shl = shl i64 %minidx.vect.body.phi1, 2
+  // %minid...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/144987


More information about the llvm-commits mailing list