[llvm] [WIP][LoopIdiomVectorize] Recognize and transform minidx pattern (PR #144987)
Madhur Amilkanthwar via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 19 23:22:08 PDT 2025
https://github.com/madhur13490 created https://github.com/llvm/llvm-project/pull/144987
This patch vectorizes the case where the array is scanned backwards and the first minimum index (minidx) is returned. The motivating example is found in the rnflow FORTRAN benchmark.
Pre-commit test can be found as part of #141556
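For context, the scalar idiom being recognized has roughly the shape below (a minimal C++ sketch under assumed Fortran-style 1-based indexing; the parameter names are illustrative, not taken from rnflow):

    // Scan a(first..last) backwards and track the index of the running
    // minimum; the strict '<' mirrors the 'fcmp contract olt' in the IR.
    int minlst(int first, int last, const float *a) {
      int pos = last;                // 1-based index of the current minimum
      for (int i = last - 1; i >= first; --i)
        if (a[i - 1] < a[pos - 1])   // a(k) lives at a[k - 1] in C
          pos = i;
      return pos;
    }

The emitted vector code handles the range in three phases: a masked partial step at the high end of the range, a whole-vector main loop that walks downwards, and a masked partial step at the low end, each using llvm.vector.reduce.fminimum on reversed lanes plus llvm.experimental.cttz.elts to recover the index.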
From d9dd731e232eb8de7afd5c5407403c0a7f21b5cf Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Thu, 19 Jun 2025 04:41:35 -0700
Subject: [PATCH] [WIP][LoopIdiomVectorize] Recognize and transform minidx
pattern
This patch vectorizes the case where the array is scanned backwards
and the first minimum index (minidx) is returned. The motivating
example is found in the rnflow FORTRAN benchmark.
Pre-commit test can be found as part of #141556
---
.../Vectorize/LoopIdiomVectorize.cpp | 714 ++++++++++++++++++
.../LoopVectorize/last-min-index-ftn.ll | 291 +++++++
2 files changed, 1005 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/last-min-index-ftn.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 491f0b76f4ae0..afb6f6aea4d59 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -70,10 +70,12 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cstdint>
using namespace llvm;
using namespace PatternMatch;
@@ -99,6 +101,11 @@ static cl::opt<bool>
cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
"not convert byte-compare loop(s)."));
+static cl::opt<bool> DisableMinMaxlocPattern(
+ "disable-loop-idiom-vectorize-minmaxloc", cl::Hidden, cl::init(false),
+ cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
+ "not convert minloc/maxloc loop(s)."));
+
static cl::opt<unsigned>
ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden,
cl::desc("The vectorization factor for byte-compare patterns."),
@@ -149,6 +156,13 @@ class LoopIdiomVectorize {
bool recognizeByteCompare();
+ bool recognizeMinIdxPattern();
+
+ bool transformMinIdxPattern(unsigned VF, Value *FirstIndex,
+ Value *SecondIndex, BasicBlock *LoopPreheader,
+ Value *BasePtr, BasicBlock *Header,
+ BasicBlock *ExitBB, Type *LoadType);
+
Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
Instruction *Index, Value *Start, Value *MaxLen);
@@ -239,9 +253,709 @@ bool LoopIdiomVectorize::run(Loop *L) {
if (recognizeFindFirstByte())
return true;
+ if (recognizeMinIdxPattern())
+ return true;
+
return false;
}
+bool LoopIdiomVectorize::recognizeMinIdxPattern() {
+ BasicBlock *Header = CurLoop->getHeader();
+ Function *F = Header->getParent();
+ BasicBlock *LoopPreheader = CurLoop->getLoopPreheader();
+
+  if (!LoopPreheader || !TTI->supportsScalableVectors() ||
+      DisableMinMaxlocPattern) {
+    LLVM_DEBUG(dbgs() << "Does not meet pre-requisites for minidx idiom\n");
+    return false;
+  }
+
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) {
+    LLVM_DEBUG(dbgs() << "Loop does not have exactly 1 back edge and 1 "
+                         "block as required by the minidx pattern\n");
+    return false;
+  }
+
+ if (Header->sizeWithoutDebug() < 14) {
+ LLVM_DEBUG(dbgs() << "Header block is too small for minloc pattern\n");
+ return false;
+ }
+
+  // We need the below things to be able to transform the pattern:
+  // 1. First index. For this we look at the terminator instruction of
+  // the predecessor of the loop preheader. The condition of the terminator
+  // instruction decides whether to jump to the scalar loop.
+  // 2. Second index.
+  // 3. Base pointer.
+  // For 2 and 3, we iterate backward from the header block to find the select
+  // instruction. The select instruction should be of the form select (fcmp
+  // contract olt loadA, loadB). Further details below. Once we find the
+  // required pattern, we can extract the base pointer from the first load
+  // instruction.
+  // 4. Exit basic block. For this we look at the terminator instruction of the
+  // header block.
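+  //
+  // In scalar form the idiom is roughly (illustrative sketch only):
+  //   pos = last;
+  //   for (i = last - 1; i >= first; --i)
+  //     if (a(i) < a(pos))
+  //       pos = i;
+  //   return pos;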
+
+ // Extract the first index from the preheader.
+ // Example LLVM IR to inspect:
+ // %4 = load i32, ptr %1, align 4
+ // %5 = load i32, ptr %0, align 4
+ // %6 = sext i32 %5 to i64
+ // %7 = sub i32 0, %4
+ // %8 = sext i32 %7 to i64
+ // %9 = add nsw i64 %8, %6
+ // %10 = sub nsw i64 0, %9
+ // %invariant.gep = ...
+ // %invariant.gep1 = ...
+ // %11 = icmp slt i64 %9, 0
+ // br i1 %11, label %.loop_preheader, ...
+ Value *ICmpSLTFirstVal = nullptr, *FirstIndex = nullptr;
+ BasicBlock *LoopPreheaderBB = nullptr, *RetBB = nullptr;
+  BasicBlock *PreheaderPred = LoopPreheader->getSinglePredecessor();
+  if (!PreheaderPred) {
+    LLVM_DEBUG(dbgs() << "Preheader does not have a single predecessor\n");
+    return false;
+  }
+ if (!match(PreheaderPred->getTerminator(),
+ m_Br(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Value(ICmpSLTFirstVal),
+ m_ZeroInt()),
+ m_BasicBlock(LoopPreheaderBB), m_BasicBlock(RetBB)))) {
+ LLVM_DEBUG(dbgs() << "Terminator doesn't match expected pattern\n");
+ return false;
+ }
+
+ // The Add operand can be either below:
+ // 1. add(sext(sub(0 - ipos2)), sext(ipos1))
+ // 2. add(sext(ipos1), sext(sub(0 - ipos2)))
+ // This depends on whether canonicalization has been done or not.
+  if (match(ICmpSLTFirstVal, m_Add(m_SExt(m_Sub(m_ZeroInt(), m_Value())),
+                                   m_SExt(m_Value())))) {
+    FirstIndex = cast<Instruction>(ICmpSLTFirstVal)->getOperand(1);
+  } else if (match(ICmpSLTFirstVal,
+                   m_Add(m_SExt(m_Value()),
+                         m_SExt(m_Sub(m_ZeroInt(), m_Value()))))) {
+    FirstIndex = cast<Instruction>(ICmpSLTFirstVal)->getOperand(0);
+ } else {
+ LLVM_DEBUG(dbgs() << "Cannot extract FirstIndex from ICmpSLTFirstVal\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "FirstIndex is " << *FirstIndex << "\n");
+
+  SelectInst *SelectToInspect = nullptr;
+  Value *BasePtr = nullptr;
+  Instruction *Trunc = nullptr;
+
+ // Iterate in backward direction to extract the select instruction which
+ // matches the pattern:
+
+  // %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  // %load1_gep = getelementptr float, ptr %invariant.gep, i64 %indvars.iv
+  // %load1 = load float, ptr %load1_gep, align 4
+  // %load2_gep = getelementptr float, ptr ..., ...
+  // %load2 = load float, ptr %load2_gep, align 4
+  // %trunc = trunc nsw i64 %indvars.iv.next to i32
+  // %fcmp = fcmp contract olt float %load1, %load2
+  // %select = select i1 %fcmp, i32 %trunc, i32 <phi>
+  for (Instruction &I : reverse(*Header)) {
+    if (auto *Sel = dyn_cast<SelectInst>(&I)) {
+      if (match(Sel, m_Select(m_SpecificFCmp(
+                                  FCmpInst::FCMP_OLT,
+                                  m_Load(m_GEP(m_Value(BasePtr), m_Value())),
+                                  m_Load(m_GEP(m_Value(), m_Value()))),
+                              m_Instruction(Trunc), m_Value()))) {
+        SelectToInspect = Sel;
+      }
+    }
+  }
+ if (!SelectToInspect || !BasePtr) {
+ LLVM_DEBUG(dbgs() << "Select or BasePtr not found\n");
+ return false;
+ }
+
+ // Extract FCmp and validate load types
+ auto *FCmp = dyn_cast<FCmpInst>(SelectToInspect->getCondition());
+ if (!FCmp || !isa<LoadInst>(FCmp->getOperand(0)) ||
+ !isa<LoadInst>(FCmp->getOperand(1)))
+ return false;
+
+ auto *LoadA = cast<LoadInst>(FCmp->getOperand(0));
+ auto *LoadB = cast<LoadInst>(FCmp->getOperand(1));
+
+ if (LoadA->getType() != LoadB->getType()) {
+ LLVM_DEBUG(dbgs() << "Load types don't match\n");
+ return false;
+ }
+
+ // Validate truncation instruction matches expected pattern
+ TruncInst *TInst = dyn_cast<TruncInst>(Trunc);
+ if (!TInst || TInst->getDestTy() != F->getReturnType()) {
+ LLVM_DEBUG(dbgs() << "Trunc instruction validation failed\n");
+ return false;
+ }
+ // Trunc instruction's operand should be of the form (add IVPHI, -1).
+ Instruction *IVInst = nullptr;
+ if (!match(TInst->getOperand(0),
+ m_Add(m_Instruction(IVInst), m_SpecificInt(-1)))) {
+ LLVM_DEBUG(
+ dbgs() << "Trunc instruction operand doesn't match expected pattern\n");
+ return false;
+ }
+
+ PHINode *IVPhi = dyn_cast<PHINode>(IVInst);
+ if (!IVPhi) {
+ LLVM_DEBUG(dbgs() << "Add operand of trunc instruction is not a PHINode\n");
+ return false;
+ }
+
+ Value *SecondIndex = IVPhi->getIncomingValueForBlock(LoopPreheader);
+ LLVM_DEBUG(dbgs() << "SecondIndex is " << *SecondIndex << "\n");
+
+ // 4. Inspect Terminator to extract the exit block.
+ // Example LLVM IR to inspect:
+ // %20 = icmp sgt i64 %13, 1
+ // br i1 %20, label %.lr.ph, label %._crit_edge.loopexit
+ Value *ICmpFirstVal = nullptr;
+ BasicBlock *FalseBB = nullptr;
+ BranchInst *Terminator = dyn_cast<BranchInst>(Header->getTerminator());
+ if (!match(Terminator, m_Br(m_SpecificICmp(ICmpInst::ICMP_SGT,
+ m_Value(ICmpFirstVal), m_One()),
+ m_BasicBlock(Header), m_BasicBlock(FalseBB)))) {
+ LLVM_DEBUG(dbgs() << "Terminator doesn't match expected pattern\n");
+ return false;
+ }
+
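+  // Number of elements in one 128-bit vector granule (e.g. 4 for float); the
+  // scalable vector types used below are <vscale x VF x elt>.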
+ unsigned VF = 128 / LoadA->getType()->getPrimitiveSizeInBits();
+
+ // We've recognized the pattern, now transform it.
+ LLVM_DEBUG(dbgs() << "FOUND MINIDX PATTERN\n");
+
+ return transformMinIdxPattern(VF, FirstIndex, SecondIndex, LoopPreheader,
+ BasePtr, Header, FalseBB, LoadA->getType());
+}
+
+bool LoopIdiomVectorize::transformMinIdxPattern(
+ unsigned VF, Value *FirstIndex, Value *SecondIndex,
+ BasicBlock *LoopPreheader, Value *BasePtr, BasicBlock *Header,
+ BasicBlock *ExitBB, Type *LoadType) {
+
+ LLVMContext &Ctx = Header->getContext();
+ Function *F = Header->getParent();
+ Module *M = F->getParent();
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ Type *I32Ty = Type::getInt32Ty(Ctx);
+ Type *I64Ty = Type::getInt64Ty(Ctx);
+ Type *I1Ty = Type::getInt1Ty(Ctx);
+  Type *PtrTy = PointerType::get(Ctx, 0);
+  auto *MaskTy = ScalableVectorType::get(I1Ty, VF);
+  // Vector type of the loaded elements, e.g. <vscale x 4 x float>.
+  auto *VecTy = ScalableVectorType::get(LoadType, VF);
+
+ BasicBlock *VecEntry = BasicBlock::Create(Ctx, "minidx.vec.entry", F);
+ BasicBlock *MinIdxPartial1If =
+ BasicBlock::Create(Ctx, "minidx.partial.1.if", F);
+ BasicBlock *MinIdxPartial1ProcExit =
+ BasicBlock::Create(Ctx, "minidx.partial.1.proc.exit", F);
+ BasicBlock *MinIdxWhileBodyLrPh =
+ BasicBlock::Create(Ctx, "minidx.while.body.ph", F);
+ BasicBlock *MinIdxVectBody = BasicBlock::Create(Ctx, "minidx.vect.body", F);
+ BasicBlock *MinIdxVectUpdate =
+ BasicBlock::Create(Ctx, "minidx.vect.update", F);
+ BasicBlock *MinIdxVectContinue =
+ BasicBlock::Create(Ctx, "minidx.vect.continue", F);
+ BasicBlock *MinIdxVectEnd = BasicBlock::Create(Ctx, "minidx.vect.end", F);
+ BasicBlock *MinIdxPartial2If =
+ BasicBlock::Create(Ctx, "minidx.partial.2.if", F);
+ BasicBlock *MinIdxPartial2Exit =
+ BasicBlock::Create(Ctx, "minidx.partial.2.exit", F);
+ BasicBlock *MinIdxEnd = BasicBlock::Create(Ctx, "minidx.end", F);
+
+ Loop *VecLoop = LI->AllocateLoop();
+ VecLoop->addBasicBlockToLoop(MinIdxVectBody, *LI);
+ VecLoop->addBasicBlockToLoop(MinIdxVectUpdate, *LI);
+ VecLoop->addBasicBlockToLoop(MinIdxVectContinue, *LI);
+
+ LI->addTopLevelLoop(VecLoop);
+
+ // Start populating preheader.
+ IRBuilder<> Builder(LoopPreheader->getTerminator());
+ // %VScale = tail call i64 @llvm.vscale.i64()
+ // %VLen = shl nuw nsw i64 %VScale, 2
+ // %minidx.not = sub nsw i64 0, %VLen
+ // %minidx.and = and i64 %ipos2, %minidx.not
+ Value *GMax = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
+ ConstantFP::getInfinity(LoadType, 0),
+ "minloc.gmax");
+ Value *VScale = Builder.CreateVScale(I64Ty);
+ Value *VLen =
+ Builder.CreateShl(VScale, ConstantInt::get(I64Ty, 2), "minidx.vlen");
+ Value *Not =
+ Builder.CreateSub(ConstantInt::get(I64Ty, 0), VLen, "minidx.not");
+ Value *And = Builder.CreateAnd(SecondIndex, Not, "minidx.and");
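+  // 'And' rounds the second (upper) index down to a multiple of the vector
+  // length (assuming a power-of-two vector length, as on SVE); the smax below
+  // clamps it to the first index so the initial masked step covers the
+  // unaligned tail at the top of the range.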
+
+  // %minidx.umax = tail call i64 @llvm.smax.i64(i64 %minidx.and, i64 %ipos1)
+  // %minidx.add = add i64 %ipos2, 1
+ Value *Umax = Builder.CreateIntrinsic(
+ Intrinsic::smax, {I64Ty}, {And, FirstIndex}, nullptr, "minidx.umax");
+ Value *Add =
+ Builder.CreateAdd(SecondIndex, ConstantInt::get(I64Ty, 1), "minidx.add");
+ // %minidx.mask = call <vscale x 4 x i1>
+ // @llvm.get.active.lane.mask.nxv4i1.i64(i64 %minidx.umax, i64 %minidx.add)
+ Value *MinlocMask = Builder.CreateCall(
+ Intrinsic::getOrInsertDeclaration(M, Intrinsic::get_active_lane_mask,
+ {MaskTy, I64Ty}),
+ {Umax, Add}, "minidx.mask");
+
+  // %minidx.add.ptr.i = getelementptr inbounds float, ptr %p,
+  //     i64 %minidx.umax.minus1
+  // %minidx.loadVals = tail call <vscale x 4 x float>
+  //     @llvm.masked.load.nxv4f32.p0(ptr %minidx.add.ptr.i, i32 1,
+  //     <vscale x 4 x i1> %minidx.mask, <vscale x 4 x float> zeroinitializer)
+  // %minidx.currentVals = select <vscale x 4 x i1> %minidx.mask,
+  //     <vscale x 4 x float> %minidx.loadVals,
+  //     <vscale x 4 x float> splat (float 0x7FF0000000000000)
+  // %minidx.reverse = tail call <vscale x 4 x i1>
+  //     @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %minidx.mask)
+  // %minidx.reverseVals = tail call <vscale x 4 x float>
+  //     @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %minidx.currentVals)
+  // %minidx.minVal = call float @llvm.vector.reduce.fminimum.nxv4f32(
+  //     <vscale x 4 x float> %minidx.reverseVals)
+
+ Value *UmaxMinus1 =
+ Builder.CreateSub(Umax, ConstantInt::get(I64Ty, 1), "minidx.umax.minus1");
+ Value *AddPtrI = Builder.CreateInBoundsGEP(LoadType, BasePtr, UmaxMinus1,
+ "minidx.add.ptr.i");
+
+ Value *LoadVals =
+ Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+                         M, Intrinsic::masked_load, {VecTy, PtrTy}),
+ {AddPtrI, ConstantInt::get(I32Ty, 1), MinlocMask,
+ Constant::getNullValue(VecTy)},
+ "minidx.loadVals");
+ Value *CurrentVals =
+ Builder.CreateSelect(MinlocMask, LoadVals, GMax, "minidx.currentVals");
+ Value *Reverse = Builder.CreateCall(
+ Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {MaskTy}),
+ {MinlocMask}, "minidx.reverse");
+ Value *ReverseVals = Builder.CreateCall(
+ Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {VecTy}),
+ {CurrentVals}, "minidx.reverseVals");
+ Value *MinVal =
+ Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::vector_reduce_fminimum, {VecTy}),
+ {ReverseVals}, "minidx.minVal");
+
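+  // Unconditionally take the vector path; the original scalar loop is kept in
+  // place but becomes unreachable from the preheader.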
+ Builder.CreateCondBr(Builder.getTrue(), VecEntry, Header);
+ LoopPreheader->getTerminator()->eraseFromParent();
+
+ // Add edge from preheader to VecEntry
+ DTU.applyUpdates({{DominatorTree::Insert, LoopPreheader, VecEntry}});
+
+  // %minidx.entry.cmp = fcmp olt float %minidx.minVal, %init
+  // br i1 %minidx.entry.cmp, label %minidx.partial.1.if,
+  //     label %minidx.partial.1.proc.exit
+ Builder.SetInsertPoint(VecEntry);
+ Value *VecEntryCmp = Builder.CreateFCmpOLT(
+ MinVal, ConstantFP::getInfinity(LoadType, 0), "minidx.entry.cmp");
+ Builder.CreateCondBr(VecEntryCmp, MinIdxPartial1If, MinIdxPartial1ProcExit);
+
+ // Connect edges from VecEntry to MinIdxPartial1If and MinIdxPartial1ProcExit
+ DTU.applyUpdates({{DominatorTree::Insert, VecEntry, MinIdxPartial1If},
+ {DominatorTree::Insert, VecEntry, MinIdxPartial1ProcExit}});
+
+ Builder.SetInsertPoint(MinIdxPartial1If);
+  // %minVal.splatinsert = insertelement <vscale x 4 x float> poison,
+  //     float %minidx.minVal, i64 0
+  // %minVal.splat = shufflevector <vscale x 4 x float> %minVal.splatinsert,
+  //     <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+ Value *MinValSplat = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
+ MinVal, "minval.splat");
+  // %minidx.partial.1.cmp = fcmp oeq <vscale x 4 x float>
+  //     %minidx.reverseVals, %minVal.splat
+  // %minidx.partial.1.and = and <vscale x 4 x i1> %minidx.reverse,
+  //     %minidx.partial.1.cmp
+  // %minidx.partial.1.cttz = tail call i64
+  //     @llvm.experimental.cttz.elts.i64.nxv4i1(
+  //         <vscale x 4 x i1> %minidx.partial.1.and, i1 true)
+ Value *FirstPartialCmp =
+ Builder.CreateFCmpOEQ(ReverseVals, MinValSplat, "minidx.partial.1.cmp");
+ Value *FirstPartialAnd =
+ Builder.CreateAnd(Reverse, FirstPartialCmp, "minidx.partial.1.and");
+ Value *FirstPartialCTTZ = Builder.CreateCountTrailingZeroElems(
+ I64Ty, FirstPartialAnd, ConstantInt::get(I1Ty, 1),
+ "minidx.partial.1.cttz");
+
+  // FIXME: Simplify this index computation.
+  // %minidx.partial.1.tmp = sub i64 %VLen, %minidx.partial.1.cttz
+  // %minidx.partial.1.tmp.minus1 = sub i64 %minidx.partial.1.tmp, 1
+  // %minidx.partial.1.add2 = add i64 %minidx.umax,
+  //     %minidx.partial.1.tmp.minus1
+  // br label %minidx.partial.1.proc.exit
+ Value *FirstPartialTmp1 =
+ Builder.CreateSub(VLen, FirstPartialCTTZ, "minidx.partial.1.tmp");
+ Value *FirstPartialTmp =
+ Builder.CreateSub(FirstPartialTmp1, ConstantInt::get(I64Ty, 1),
+ "minidx.partial.1.tmp.minus1");
+ Value *FirstPartialAdd2 =
+ Builder.CreateAdd(Umax, FirstPartialTmp, "minidx.partial.1.add2");
+
+ Builder.CreateBr(MinIdxPartial1ProcExit);
+
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, MinIdxPartial1If, MinIdxPartial1ProcExit}});
+
+ Builder.SetInsertPoint(MinIdxPartial1ProcExit);
+  // %minidx.partial.1.exit.known_min = phi float
+  //     [ %minidx.minVal, %minidx.partial.1.if ], [ %init, %entry ]
+  // %partial1.exit.known_arg = phi i64
+  //     [ %minidx.partial.1.add2, %minidx.partial.1.if ], [ 0, %entry ]
+ PHINode *Partial1ExitKnownMin =
+ Builder.CreatePHI(LoadType, 2, "minidx.partial.1.exit.known_min");
+ PHINode *Partial1ExitKnownArg =
+ Builder.CreatePHI(I64Ty, 2, "partial1.exit.known_arg");
+
+ Partial1ExitKnownMin->addIncoming(MinVal, MinIdxPartial1If);
+ Partial1ExitKnownMin->addIncoming(ConstantFP::getInfinity(LoadType, 0),
+ VecEntry);
+ Partial1ExitKnownArg->addIncoming(FirstPartialAdd2, MinIdxPartial1If);
+ Partial1ExitKnownArg->addIncoming(ConstantInt::get(I64Ty, 0), VecEntry);
+
+  // %minidx.partial.1.proc.exit.add = add i64 %VLen, %ipos1
+  // %minidx.partial.1.proc.exit.icmp = icmp ult i64 %minidx.umax,
+  //     %minidx.partial.1.proc.exit.add
+  // br i1 %minidx.partial.1.proc.exit.icmp, label %minidx.vect.end,
+  //     label %minidx.while.body.ph
+ Value *MinIdxPartial1ProcExitAdd =
+ Builder.CreateAdd(VLen, FirstIndex, "minidx.partial.1.proc.exit.add");
+ Value *MinIdxPartial1ProcExitICmp = Builder.CreateICmpULT(
+ Umax, MinIdxPartial1ProcExitAdd, "minidx.partial.1.proc.exit.icmp");
+ Builder.CreateCondBr(MinIdxPartial1ProcExitICmp, MinIdxVectEnd,
+ MinIdxWhileBodyLrPh);
+
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, MinIdxPartial1ProcExit, MinIdxVectEnd},
+ {DominatorTree::Insert, MinIdxPartial1ProcExit, MinIdxWhileBodyLrPh}});
+
+  Builder.SetInsertPoint(MinIdxWhileBodyLrPh);
+  // br label %minidx.vect.body
+  Builder.CreateBr(MinIdxVectBody);
+
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, MinIdxWhileBodyLrPh, MinIdxVectBody}});
+
+ Builder.SetInsertPoint(MinIdxVectBody);
+  // %minidx.vect.body.phi1 = phi i64 [ %minidx.umax, %minidx.while.body.ph ],
+  //     [ %minidx.vect.body.sub, %minidx.vect.continue ]
+  // %minidx.vect.body.known_arg = phi i64
+  //     [ %partial1.exit.known_arg, %minidx.while.body.ph ],
+  //     [ %minidx.vect.continue.known_arg, %minidx.vect.continue ]
+  // %minidx.vect.body.known_min = phi float
+  //     [ %minidx.partial.1.exit.known_min, %minidx.while.body.ph ],
+  //     [ %minidx.vect.continue.known_min, %minidx.vect.continue ]
+ PHINode *MinIdxVectBodyPhi1 =
+ Builder.CreatePHI(I64Ty, 2, "minidx.vect.body.phi1");
+ PHINode *MinIdxVectBodyKnownArg =
+ Builder.CreatePHI(I64Ty, 2, "minidx.vect.body.known_arg");
+ PHINode *MinIdxVectBodyKnownMin =
+ Builder.CreatePHI(LoadType, 2, "minidx.vect.body.known_min");
+
+  // %minidx.vect.body.sub = sub i64 %minidx.vect.body.phi1, %VLen
+  // %minidx.vect.body.sub.minus1 = sub i64 %minidx.vect.body.sub, 1
+  // %minidx.vect.body.gep = getelementptr inbounds float, ptr %p,
+  //     i64 %minidx.vect.body.sub.minus1
+  Value *MinIdxVectBodySub =
+      Builder.CreateSub(MinIdxVectBodyPhi1, VLen, "minidx.vect.body.sub");
+  Value *MinIdxVectBodySubMinus1 =
+      Builder.CreateSub(MinIdxVectBodySub, ConstantInt::get(I64Ty, 1),
+                        "minidx.vect.body.sub.minus1");
+  Value *MinIdxVectBodyGEP = Builder.CreateInBoundsGEP(
+      LoadType, BasePtr, MinIdxVectBodySubMinus1, "minidx.vect.body.gep");
+
+  // %minidx.vect.body.unmaskedload = load <vscale x 4 x float>,
+  //     ptr %minidx.vect.body.gep
+  // %minidx.vect.body.reverse = tail call <vscale x 4 x float>
+  //     @llvm.vector.reverse.nxv4f32(
+  //         <vscale x 4 x float> %minidx.vect.body.unmaskedload)
+  // %minidx.vect.body.reduce = tail call float
+  //     @llvm.vector.reduce.fminimum.nxv4f32(
+  //         <vscale x 4 x float> %minidx.vect.body.reverse)
+ Value *MinIdxVectBodyUnmaskedLoad = Builder.CreateLoad(
+ VecTy, MinIdxVectBodyGEP, "minidx.vect.body.unmaskedload");
+ Value *MinIdxVectBodyReverse = Builder.CreateCall(
+ Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {VecTy}),
+ {MinIdxVectBodyUnmaskedLoad}, "minidx.vect.body.reverse");
+ Value *MinIdxVectBodyReduce =
+ Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::vector_reduce_fminimum, {VecTy}),
+ {MinIdxVectBodyReverse}, "minidx.vect.body.reduce");
+
+  // %minidx.vect.body.fcmp = fcmp olt float %minidx.vect.body.reduce,
+  //     %minidx.vect.body.known_min
+  // br i1 %minidx.vect.body.fcmp, label %minidx.vect.update,
+  //     label %minidx.vect.continue
+ Value *MinIdxVectBodyFCmp = Builder.CreateFCmpOLT(
+ MinIdxVectBodyReduce, MinIdxVectBodyKnownMin, "minidx.vect.body.fcmp");
+
+ Builder.CreateCondBr(MinIdxVectBodyFCmp, MinIdxVectUpdate,
+ MinIdxVectContinue);
+
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, MinIdxVectBody, MinIdxVectUpdate},
+ {DominatorTree::Insert, MinIdxVectBody, MinIdxVectContinue}});
+
+ Builder.SetInsertPoint(MinIdxVectUpdate);
+  // %minidx.vect.update.splatinsert = insertelement <vscale x 4 x float>
+  //     poison, float %minidx.vect.body.reduce, i64 0
+  // %minidx.vect.update.splat = shufflevector <vscale x 4 x float>
+  //     %minidx.vect.update.splatinsert, <vscale x 4 x float> poison,
+  //     <vscale x 4 x i32> zeroinitializer
+  // %minidx.vect.update.fcmp = fcmp ueq <vscale x 4 x float>
+  //     %minidx.vect.body.reverse, %minidx.vect.update.splat
+ Value *MinIdxVectUpdateSplat = Builder.CreateVectorSplat(
+ ElementCount::getScalable(VF), MinIdxVectBodyReduce,
+ "minidx.vect.update.splatinsert");
+ Value *MinIdxVectUpdateFCmp = Builder.CreateFCmpUEQ(
+ MinIdxVectBodyReverse, MinIdxVectUpdateSplat, "minidx.vect.update.fcmp");
+
+  // %minidx.vect.update.cttz = call i64
+  //     @llvm.experimental.cttz.elts.i64.nxv4i1(
+  //         <vscale x 4 x i1> %minidx.vect.update.fcmp, i1 true)
+  // %minidx.vect.update.mul = mul i64 %minidx.vect.update.cttz, -1
+  // %minidx.vect.update.add = add i64 %minidx.vect.body.phi1,
+  //     %minidx.vect.update.mul
+ Value *MinIdxVectUpdateCTTZ = Builder.CreateCountTrailingZeroElems(
+ I64Ty, MinIdxVectUpdateFCmp, ConstantInt::get(I1Ty, 1),
+ "minidx.vect.update.cttz");
+ Value *MinIdxVectUpdateMul =
+ Builder.CreateMul(MinIdxVectUpdateCTTZ, ConstantInt::get(I64Ty, -1),
+ "minidx.vect.update.mul");
+ Value *MinIdxVectUpdateAdd = Builder.CreateAdd(
+ MinIdxVectBodyPhi1, MinIdxVectUpdateMul, "minidx.vect.update.add");
+
+ // %minidx.vect.body.add2 = add i64 %minidx.vect.update.add, -1
+ // br label %minidx.vect.continue
+ Value *MinIdxVectBodyAdd2 =
+ Builder.CreateAdd(MinIdxVectUpdateAdd, ConstantInt::get(I64Ty, -1),
+ "minidx.vect.body.add2");
+ Builder.CreateBr(MinIdxVectContinue);
+
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, MinIdxVectUpdate, MinIdxVectContinue}});
+
+ Builder.SetInsertPoint(MinIdxVectContinue);
+  // %minidx.vect.continue.known_min = phi float
+  //     [ %minidx.vect.body.reduce, %minidx.vect.update ],
+  //     [ %minidx.vect.body.known_min, %minidx.vect.body ]
+  // %minidx.vect.continue.known_arg = phi i64
+  //     [ %minidx.vect.body.add2, %minidx.vect.update ],
+  //     [ %minidx.vect.body.known_arg, %minidx.vect.body ]
+  // %minidx.vect.continue.icmp = icmp ult i64 %minidx.vect.body.sub,
+  //     %minidx.partial.1.proc.exit.add
+ PHINode *MinIdxVectContinueKnownMin =
+ Builder.CreatePHI(LoadType, 2, "minidx.vect.continue.known_min");
+ PHINode *MinIdxVectContinueKnownArg =
+ Builder.CreatePHI(I64Ty, 2, "minidx.vect.continue.known_arg");
+
+ MinIdxVectContinueKnownMin->addIncoming(MinIdxVectBodyReduce,
+ MinIdxVectUpdate);
+ MinIdxVectContinueKnownMin->addIncoming(MinIdxVectBodyKnownMin,
+ MinIdxVectBody);
+ MinIdxVectContinueKnownArg->addIncoming(MinIdxVectBodyAdd2, MinIdxVectUpdate);
+ MinIdxVectContinueKnownArg->addIncoming(MinIdxVectBodyKnownArg,
+ MinIdxVectBody);
+
+  // br i1 %minidx.vect.continue.icmp, label %minidx.vect.end,
+  //     label %minidx.vect.body
+ Value *MinIdxVectContinueICmp =
+ Builder.CreateICmpULT(MinIdxVectBodySub, MinIdxPartial1ProcExitAdd,
+ "minidx.vect.continue.icmp");
+
+ Builder.CreateCondBr(MinIdxVectContinueICmp, MinIdxVectEnd, MinIdxVectBody);
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, MinIdxVectContinue, MinIdxVectEnd},
+ {DominatorTree::Insert, MinIdxVectContinue, MinIdxVectBody}});
+
+ Builder.SetInsertPoint(MinIdxVectEnd);
+  // %minidx.vect.end.known_min.lcssa = phi float
+  //     [ %minidx.partial.1.exit.known_min, %minidx.partial.1.proc.exit ],
+  //     [ %minidx.vect.continue.known_min, %minidx.vect.continue ]
+  // %minidx.vect.end.known_arg.lcssa = phi i64
+  //     [ %partial1.exit.known_arg, %minidx.partial.1.proc.exit ],
+  //     [ %minidx.vect.continue.known_arg, %minidx.vect.continue ]
+  // %minidx.vect.end.lcssa = phi i64
+  //     [ %minidx.umax, %minidx.partial.1.proc.exit ],
+  //     [ %minidx.vect.body.sub, %minidx.vect.continue ]
+ PHINode *MinIdxVectEndKnownMin =
+ Builder.CreatePHI(LoadType, 2, "minidx.vect.end.known_min.lcssa");
+ PHINode *MinIdxVectEndKnownArg =
+ Builder.CreatePHI(I64Ty, 2, "minidx.vect.end.known_arg.lcssa");
+ PHINode *MinIdxVectEndLCSSA =
+ Builder.CreatePHI(I64Ty, 2, "minidx.vect.end.lcssa");
+
+ MinIdxVectEndKnownMin->addIncoming(Partial1ExitKnownMin,
+ MinIdxPartial1ProcExit);
+ MinIdxVectEndKnownMin->addIncoming(MinIdxVectContinueKnownMin,
+ MinIdxVectContinue);
+ MinIdxVectEndKnownArg->addIncoming(Partial1ExitKnownArg,
+ MinIdxPartial1ProcExit);
+ MinIdxVectEndKnownArg->addIncoming(MinIdxVectContinueKnownArg,
+ MinIdxVectContinue);
+ MinIdxVectEndLCSSA->addIncoming(Umax, MinIdxPartial1ProcExit);
+ MinIdxVectEndLCSSA->addIncoming(MinIdxVectBodySub, MinIdxVectContinue);
+
+  // %minidx.vect.end.cmp = icmp ugt i64 %minidx.vect.end.lcssa, %ipos1
+  // br i1 %minidx.vect.end.cmp, label %minidx.partial.2.if, label %minidx.end
+
+ Value *MinIdxVectEndCmp = Builder.CreateICmpUGT(
+ MinIdxVectEndLCSSA, FirstIndex, "minidx.vect.end.cmp");
+ Builder.CreateCondBr(MinIdxVectEndCmp, MinIdxPartial2If, MinIdxEnd);
+ DTU.applyUpdates({{DominatorTree::Insert, MinIdxVectEnd, MinIdxPartial2If},
+ {DominatorTree::Insert, MinIdxVectEnd, MinIdxEnd}});
+
+ Builder.SetInsertPoint(MinIdxPartial2If);
+  // %minidx.partial.2.if.add.zero = add i64 %minidx.vect.end.lcssa, 0
+  // %minidx.partial.2.if.mask = call <vscale x 4 x i1>
+  //     @llvm.get.active.lane.mask.nxv4i1.i64(i64 %ipos1,
+  //     i64 %minidx.partial.2.if.add.zero)
+  Value *MinIdxPartial2IfAdd =
+      Builder.CreateAdd(MinIdxVectEndLCSSA, ConstantInt::get(I64Ty, 0),
+                        "minidx.partial.2.if.add.zero");
+ Value *MinIdxPartial2IfMask = Builder.CreateCall(
+ Intrinsic::getOrInsertDeclaration(M, Intrinsic::get_active_lane_mask,
+ {MaskTy, I64Ty}),
+ {FirstIndex, MinIdxPartial2IfAdd}, "minidx.partial.2.if.mask");
+
+ // Reverse the mask.
+ MinIdxPartial2IfMask = Builder.CreateCall(
+ Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {MaskTy}),
+ {MinIdxPartial2IfMask}, "minidx.partial.2.if.mask.reverse");
+
+ Value *IncomingIposMinus1 =
+ Builder.CreateSub(FirstIndex, ConstantInt::get(I64Ty, 1),
+ "minidx.partial.2.if.ipos1.minus1");
+ Value *MinIdxPartial2IfGEP = Builder.CreateInBoundsGEP(
+ LoadType, BasePtr, IncomingIposMinus1, "minidx.partial.2.if.gep");
+
+  // %minidx.partial.2.if.load = tail call <vscale x 4 x float>
+  //     @llvm.masked.load.nxv4f32.p0(ptr %minidx.partial.2.if.gep, i32 1,
+  //     <vscale x 4 x i1> %minidx.partial.2.if.mask.reverse,
+  //     <vscale x 4 x float> zeroinitializer)
+  // %minidx.partial.2.if.reverse = tail call <vscale x 4 x float>
+  //     @llvm.vector.reverse.nxv4f32(
+  //         <vscale x 4 x float> %minidx.partial.2.if.load)
+  // %minidx.partial.2.if.reduce = tail call float
+  //     @llvm.vector.reduce.fminimum.nxv4f32(
+  //         <vscale x 4 x float> %minidx.partial.2.if.reverse)
+ Value *MinIdxPartial2IfLoad =
+ Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+                             M, Intrinsic::masked_load, {VecTy, PtrTy}),
+ {MinIdxPartial2IfGEP, ConstantInt::get(I32Ty, 1),
+ MinIdxPartial2IfMask, Constant::getNullValue(VecTy)},
+ "minidx.partial.2.if.load");
+ Value *MinIdxPartial2IfReverse = Builder.CreateCall(
+ Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {VecTy}),
+ {MinIdxPartial2IfLoad}, "minidx.partial.2.if.reverse");
+ Value *MinIdxPartial2IfReduce = Builder.CreateCall(
+ Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reduce_fminimum,
+ {VecTy}),
+ {MinIdxPartial2IfReverse}, "minidx.partial.2.if.reduce");
+
+  // %minidx.partial.2.if.fcmp = fcmp olt float %minidx.partial.2.if.reduce,
+  //     %minidx.vect.end.known_min.lcssa
+  // br i1 %minidx.partial.2.if.fcmp, label %minidx.partial.2.exit,
+  //     label %minidx.end
+ Value *MinIdxPartial2IfFCmp =
+ Builder.CreateFCmpOLT(MinIdxPartial2IfReduce, MinIdxVectEndKnownMin,
+ "minidx.partial.2.if.fcmp");
+ Builder.CreateCondBr(MinIdxPartial2IfFCmp, MinIdxPartial2Exit, MinIdxEnd);
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, MinIdxPartial2If, MinIdxPartial2Exit},
+ {DominatorTree::Insert, MinIdxPartial2If, MinIdxEnd}});
+
+ Builder.SetInsertPoint(MinIdxPartial2Exit);
+  // %minidx.partial.2.exit.splatinsert = insertelement <vscale x 4 x float>
+  //     poison, float %minidx.partial.2.if.reduce, i64 0
+  // %minidx.partial.2.exit.splat = shufflevector <vscale x 4 x float>
+  //     %minidx.partial.2.exit.splatinsert, <vscale x 4 x float> poison,
+  //     <vscale x 4 x i32> zeroinitializer
+  // %minidx.partial.2.exit.fcmp = fcmp oeq <vscale x 4 x float>
+  //     %minidx.partial.2.if.reverse, %minidx.partial.2.exit.splat
+ Value *MinIdxPartial2ExitSplat = Builder.CreateVectorSplat(
+ ElementCount::getScalable(VF), MinIdxPartial2IfReduce,
+ "minidx.partial.2.exit.splatinsert");
+ Value *MinIdxPartial2ExitFCmp =
+ Builder.CreateFCmpOEQ(MinIdxPartial2IfReverse, MinIdxPartial2ExitSplat,
+ "minidx.partial.2.exit.fcmp");
+
+  // %minidx.partial.2.exit.and = and <vscale x 4 x i1>
+  //     %minidx.partial.2.exit.fcmp, %minidx.partial.2.if.mask.reverse
+  // %minidx.partial.2.exit.cttz = call i64
+  //     @llvm.experimental.cttz.elts.i64.nxv4i1(
+  //         <vscale x 4 x i1> %minidx.partial.2.exit.and, i1 true)
+ Value *MinIdxPartial2ExitAnd =
+ Builder.CreateAnd(MinIdxPartial2ExitFCmp, MinIdxPartial2IfMask,
+ "minidx.partial.2.exit.and");
+ Value *MinIdxPartial2ExitCTTZ = Builder.CreateCountTrailingZeroElems(
+ I64Ty, MinIdxPartial2ExitAnd, ConstantInt::get(I1Ty, 1),
+ "minidx.partial.2.exit.cttz");
+
+ Value *MinIdxPartial2ExitTmp1 = Builder.CreateSub(
+ VLen, MinIdxPartial2ExitCTTZ, "minidx.partial.2.exit.tmp");
+ Value *MinIdxPartial2ExitTmp =
+ Builder.CreateSub(MinIdxPartial2ExitTmp1, ConstantInt::get(I64Ty, 1),
+ "minidx.partial.2.exit.tmp.minus1");
+ Value *MinIdxPartial2ExitAdd = Builder.CreateAdd(
+ FirstIndex, MinIdxPartial2ExitTmp, "minidx.partial.2.exit.add2");
+
+  // br label %minidx.end
+ Builder.CreateBr(MinIdxEnd);
+
+ DTU.applyUpdates({{DominatorTree::Insert, MinIdxPartial2Exit, MinIdxEnd}});
+
+ Builder.SetInsertPoint(MinIdxEnd);
+  // %minidx.ret = phi i64
+  //     [ %minidx.vect.end.known_arg.lcssa, %minidx.vect.end ],
+  //     [ %minidx.partial.2.exit.add2, %minidx.partial.2.exit ],
+  //     [ %minidx.vect.end.known_arg.lcssa, %minidx.partial.2.if ]
+  // br label %ExitBB
+ PHINode *MinIdxRet = Builder.CreatePHI(I64Ty, 3, "minidx.ret");
+ MinIdxRet->addIncoming(MinIdxVectEndKnownArg, MinIdxVectEnd);
+ MinIdxRet->addIncoming(MinIdxPartial2ExitAdd, MinIdxPartial2Exit);
+ MinIdxRet->addIncoming(MinIdxVectEndKnownArg, MinIdxPartial2If);
+
+  // Truncate the 64-bit index to the function's return type.
+  Value *MinIdxRetBitCast = Builder.CreateTruncOrBitCast(
+      MinIdxRet, F->getReturnType(), "minidx.ret.bitcast");
+
+ Builder.CreateBr(ExitBB);
+ DTU.applyUpdates({{DominatorTree::Insert, MinIdxEnd, ExitBB}});
+
+ MinIdxVectBodyPhi1->addIncoming(Umax, MinIdxWhileBodyLrPh);
+ MinIdxVectBodyPhi1->addIncoming(MinIdxVectBodySub, MinIdxVectContinue);
+
+ MinIdxVectBodyKnownArg->addIncoming(Partial1ExitKnownArg,
+ MinIdxWhileBodyLrPh);
+ MinIdxVectBodyKnownArg->addIncoming(MinIdxVectContinueKnownArg,
+ MinIdxVectContinue);
+
+ MinIdxVectBodyKnownMin->addIncoming(Partial1ExitKnownMin,
+ MinIdxWhileBodyLrPh);
+ MinIdxVectBodyKnownMin->addIncoming(MinIdxVectContinueKnownMin,
+ MinIdxVectContinue);
+
+  // Collect the exit block PHIs up front since we are about to rewrite them.
+  SmallVector<PHINode *, 8> PHIsToReplace;
+  for (PHINode &PHI : ExitBB->phis())
+    PHIsToReplace.push_back(&PHI);
+
+ // Now perform the replacement
+ for (PHINode *PHI : PHIsToReplace) {
+ // Create PHI at the beginning of the block
+ Builder.SetInsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
+ PHINode *ExitPHI =
+ Builder.CreatePHI(F->getReturnType(), PHI->getNumIncomingValues() + 1);
+ for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
+ ExitPHI->addIncoming(PHI->getIncomingValue(I), PHI->getIncomingBlock(I));
+ }
+ ExitPHI->addIncoming(MinIdxRetBitCast, MinIdxEnd);
+ // Replace all uses of PHI with ExitPHI.
+ PHI->replaceAllUsesWith(ExitPHI);
+ PHI->eraseFromParent();
+ }
+
+  VecLoop->verifyLoop();
+  if (!VecLoop->isRecursivelyLCSSAForm(*DT, *LI)) {
+    LLVM_DEBUG(dbgs() << "Loop is not in LCSSA form\n");
+    LLVM_DEBUG(VecLoop->print(dbgs()));
+  }
+
+ return true;
+}
+
bool LoopIdiomVectorize::recognizeByteCompare() {
// Currently the transformation only works on scalable vector types, although
// there is no fundamental reason why it cannot be made to work for fixed
diff --git a/llvm/test/Transforms/LoopVectorize/last-min-index-ftn.ll b/llvm/test/Transforms/LoopVectorize/last-min-index-ftn.ll
new file mode 100644
index 0000000000000..013bb45b44e2c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/last-min-index-ftn.ll
@@ -0,0 +1,291 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN-VW1-IL4
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN-VW4-IL1
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN-VW4-IL2
+; RUN: opt -passes=loop-idiom-vectorize -S -mtriple=aarch64 -mattr=+sve %s | FileCheck %s --check-prefix=CHECK-LOOP-IDIOM
+
+; This test case is extracted from the rnflow (Fortran) benchmark in the
+; Polyhedron benchmark suite. The function minlst takes two indices (i.e. a
+; range), scans the range backwards, and returns the first index (IV) at
+; which the minimum value occurs.
+
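+; Note: the loop-vectorize RUN lines document that LV currently leaves this
+; loop scalar (their CHECK bodies below still contain the scalar loop); only
+; the loop-idiom-vectorize RUN line exercises the new transform.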
+define i32 @minlst(i32 %first_index, i32 %last_index, ptr %array) {
+; CHECK-REV-MIN-VW1-IL4-LABEL: define i32 @minlst(
+; CHECK-REV-MIN-VW1-IL4-SAME: i32 [[FIRST_INDEX:%.*]], i32 [[LAST_INDEX:%.*]], ptr [[ARRAY:%.*]]) {
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[ENTRY:.*]]:
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[FIRST_INDEX_SEXT:%.*]] = sext i32 [[FIRST_INDEX]] to i64
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LAST_INDEX_NEG:%.*]] = sub i32 0, [[LAST_INDEX]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LAST_INDEX_NEG_SEXT:%.*]] = sext i32 [[LAST_INDEX_NEG]] to i64
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[ADD:%.*]] = add nsw i64 [[FIRST_INDEX_SEXT]], [[LAST_INDEX_NEG_SEXT]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[DIFF:%.*]] = sub nsw i64 0, [[ADD]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[FIRST_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -8
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[SECOND_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -4
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[EARLY_EXIT_COND:%.*]] = icmp slt i64 [[ADD]], 0
+; CHECK-REV-MIN-VW1-IL4-NEXT: br i1 [[EARLY_EXIT_COND]], label %[[LOOP_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]]
+; CHECK-REV-MIN-VW1-IL4: [[LOOP_PREHEADER]]:
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LAST_INDEX_SEXT:%.*]] = sext i32 [[LAST_INDEX]] to i64
+; CHECK-REV-MIN-VW1-IL4-NEXT: br label %[[LOOP:.*]]
+; CHECK-REV-MIN-VW1-IL4: [[LOOP]]:
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX_SEXT]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[DIFF]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[INDEX:%.*]] = phi i32 [ [[SELECT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LOAD1_PTR:%.*]] = getelementptr float, ptr [[FIRST_PTR]], i64 [[IV]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LOAD1:%.*]] = load float, ptr [[LOAD1_PTR]], align 4
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[INDEX_SEXT:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LOAD2_PTR:%.*]] = getelementptr float, ptr [[SECOND_PTR]], i64 [[INDEX_SEXT]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LOAD2:%.*]] = load float, ptr [[LOAD2_PTR]], align 4
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[CMP:%.*]] = fcmp contract olt float [[LOAD1]], [[LOAD2]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[IV_NEXT_TRUNC:%.*]] = trunc nsw i64 [[IV_NEXT]] to i32
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[SELECT]] = select i1 [[CMP]], i32 [[IV_NEXT_TRUNC]], i32 [[INDEX]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[DEC]] = add nsw i64 [[DEC_IV]], -1
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LOOP_COND:%.*]] = icmp sgt i64 [[DEC_IV]], 1
+; CHECK-REV-MIN-VW1-IL4-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[DOT_CRIT_EDGE_LOOPEXIT:.*]]
+; CHECK-REV-MIN-VW1-IL4: [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], %[[LOOP]] ]
+; CHECK-REV-MIN-VW1-IL4-NEXT: br [[DOT_CRIT_EDGE]]
+; CHECK-REV-MIN-VW1-IL4: [[__CRIT_EDGE:.*:]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LAST_INDEX_RET:%.*]] = phi i32 [ [[LAST_INDEX]], %[[ENTRY]] ], [ [[SELECT_LCSSA]], %[[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-REV-MIN-VW1-IL4-NEXT: ret i32 [[LAST_INDEX_RET]]
+;
+; CHECK-REV-MIN-VW4-IL1-LABEL: define i32 @minlst(
+; CHECK-REV-MIN-VW4-IL1-SAME: i32 [[FIRST_INDEX:%.*]], i32 [[LAST_INDEX:%.*]], ptr [[ARRAY:%.*]]) {
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[ENTRY:.*]]:
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[FIRST_INDEX_SEXT:%.*]] = sext i32 [[FIRST_INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LAST_INDEX_NEG:%.*]] = sub i32 0, [[LAST_INDEX]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LAST_INDEX_NEG_SEXT:%.*]] = sext i32 [[LAST_INDEX_NEG]] to i64
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[ADD:%.*]] = add nsw i64 [[FIRST_INDEX_SEXT]], [[LAST_INDEX_NEG_SEXT]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[DIFF:%.*]] = sub nsw i64 0, [[ADD]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[FIRST_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -8
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[SECOND_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -4
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[EARLY_EXIT_COND:%.*]] = icmp slt i64 [[ADD]], 0
+; CHECK-REV-MIN-VW4-IL1-NEXT: br i1 [[EARLY_EXIT_COND]], label %[[LOOP_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]]
+; CHECK-REV-MIN-VW4-IL1: [[LOOP_PREHEADER]]:
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LAST_INDEX_SEXT:%.*]] = sext i32 [[LAST_INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL1-NEXT: br label %[[LOOP:.*]]
+; CHECK-REV-MIN-VW4-IL1: [[LOOP]]:
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX_SEXT]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[DIFF]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[INDEX:%.*]] = phi i32 [ [[SELECT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LOAD1_PTR:%.*]] = getelementptr float, ptr [[FIRST_PTR]], i64 [[IV]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LOAD1:%.*]] = load float, ptr [[LOAD1_PTR]], align 4
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[INDEX_SEXT:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LOAD2_PTR:%.*]] = getelementptr float, ptr [[SECOND_PTR]], i64 [[INDEX_SEXT]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LOAD2:%.*]] = load float, ptr [[LOAD2_PTR]], align 4
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[CMP:%.*]] = fcmp contract olt float [[LOAD1]], [[LOAD2]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[IV_NEXT_TRUNC:%.*]] = trunc nsw i64 [[IV_NEXT]] to i32
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[SELECT]] = select i1 [[CMP]], i32 [[IV_NEXT_TRUNC]], i32 [[INDEX]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[DEC]] = add nsw i64 [[DEC_IV]], -1
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LOOP_COND:%.*]] = icmp sgt i64 [[DEC_IV]], 1
+; CHECK-REV-MIN-VW4-IL1-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[DOT_CRIT_EDGE_LOOPEXIT:.*]]
+; CHECK-REV-MIN-VW4-IL1: [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], %[[LOOP]] ]
+; CHECK-REV-MIN-VW4-IL1-NEXT: br [[DOT_CRIT_EDGE]]
+; CHECK-REV-MIN-VW4-IL1: [[__CRIT_EDGE:.*:]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LAST_INDEX_RET:%.*]] = phi i32 [ [[LAST_INDEX]], %[[ENTRY]] ], [ [[SELECT_LCSSA]], %[[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-REV-MIN-VW4-IL1-NEXT: ret i32 [[LAST_INDEX_RET]]
+;
+; CHECK-REV-MIN-VW4-IL2-LABEL: define i32 @minlst(
+; CHECK-REV-MIN-VW4-IL2-SAME: i32 [[FIRST_INDEX:%.*]], i32 [[LAST_INDEX:%.*]], ptr [[ARRAY:%.*]]) {
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[ENTRY:.*]]:
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[FIRST_INDEX_SEXT:%.*]] = sext i32 [[FIRST_INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LAST_INDEX_NEG:%.*]] = sub i32 0, [[LAST_INDEX]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LAST_INDEX_NEG_SEXT:%.*]] = sext i32 [[LAST_INDEX_NEG]] to i64
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[ADD:%.*]] = add nsw i64 [[FIRST_INDEX_SEXT]], [[LAST_INDEX_NEG_SEXT]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[DIFF:%.*]] = sub nsw i64 0, [[ADD]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[FIRST_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -8
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[SECOND_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -4
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[EARLY_EXIT_COND:%.*]] = icmp slt i64 [[ADD]], 0
+; CHECK-REV-MIN-VW4-IL2-NEXT: br i1 [[EARLY_EXIT_COND]], label %[[LOOP_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]]
+; CHECK-REV-MIN-VW4-IL2: [[LOOP_PREHEADER]]:
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LAST_INDEX_SEXT:%.*]] = sext i32 [[LAST_INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL2-NEXT: br label %[[LOOP:.*]]
+; CHECK-REV-MIN-VW4-IL2: [[LOOP]]:
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX_SEXT]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[DIFF]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[INDEX:%.*]] = phi i32 [ [[SELECT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LOAD1_PTR:%.*]] = getelementptr float, ptr [[FIRST_PTR]], i64 [[IV]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LOAD1:%.*]] = load float, ptr [[LOAD1_PTR]], align 4
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[INDEX_SEXT:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LOAD2_PTR:%.*]] = getelementptr float, ptr [[SECOND_PTR]], i64 [[INDEX_SEXT]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LOAD2:%.*]] = load float, ptr [[LOAD2_PTR]], align 4
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[CMP:%.*]] = fcmp contract olt float [[LOAD1]], [[LOAD2]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[IV_NEXT_TRUNC:%.*]] = trunc nsw i64 [[IV_NEXT]] to i32
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[SELECT]] = select i1 [[CMP]], i32 [[IV_NEXT_TRUNC]], i32 [[INDEX]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[DEC]] = add nsw i64 [[DEC_IV]], -1
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LOOP_COND:%.*]] = icmp sgt i64 [[DEC_IV]], 1
+; CHECK-REV-MIN-VW4-IL2-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[DOT_CRIT_EDGE_LOOPEXIT:.*]]
+; CHECK-REV-MIN-VW4-IL2: [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], %[[LOOP]] ]
+; CHECK-REV-MIN-VW4-IL2-NEXT: br [[DOT_CRIT_EDGE]]
+; CHECK-REV-MIN-VW4-IL2: [[__CRIT_EDGE:.*:]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LAST_INDEX_RET:%.*]] = phi i32 [ [[LAST_INDEX]], %[[ENTRY]] ], [ [[SELECT_LCSSA]], %[[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-REV-MIN-VW4-IL2-NEXT: ret i32 [[LAST_INDEX_RET]]
+;
+; CHECK-LOOP-IDIOM-LABEL: define i32 @minlst(
+; CHECK-LOOP-IDIOM-SAME: i32 [[FIRST_INDEX:%.*]], i32 [[LAST_INDEX:%.*]], ptr [[ARRAY:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-LOOP-IDIOM-NEXT: [[ENTRY:.*]]:
+; CHECK-LOOP-IDIOM-NEXT: [[FIRST_INDEX_SEXT:%.*]] = sext i32 [[FIRST_INDEX]] to i64
+; CHECK-LOOP-IDIOM-NEXT: [[LAST_INDEX_NEG:%.*]] = sub i32 0, [[LAST_INDEX]]
+; CHECK-LOOP-IDIOM-NEXT: [[LAST_INDEX_NEG_SEXT:%.*]] = sext i32 [[LAST_INDEX_NEG]] to i64
+; CHECK-LOOP-IDIOM-NEXT: [[ADD:%.*]] = add nsw i64 [[FIRST_INDEX_SEXT]], [[LAST_INDEX_NEG_SEXT]]
+; CHECK-LOOP-IDIOM-NEXT: [[DIFF:%.*]] = sub nsw i64 0, [[ADD]]
+; CHECK-LOOP-IDIOM-NEXT: [[FIRST_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -8
+; CHECK-LOOP-IDIOM-NEXT: [[SECOND_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -4
+; CHECK-LOOP-IDIOM-NEXT: [[EARLY_EXIT_COND:%.*]] = icmp slt i64 [[ADD]], 0
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[EARLY_EXIT_COND]], label %[[LOOP_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]]
+; CHECK-LOOP-IDIOM: [[LOOP_PREHEADER]]:
+; CHECK-LOOP-IDIOM-NEXT: [[LAST_INDEX_SEXT:%.*]] = sext i32 [[LAST_INDEX]] to i64
+; CHECK-LOOP-IDIOM-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VLEN:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_NOT:%.*]] = sub i64 0, [[MINIDX_VLEN]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_AND:%.*]] = and i64 [[LAST_INDEX_SEXT]], [[MINIDX_NOT]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_UMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[MINIDX_AND]], i64 [[FIRST_INDEX_SEXT]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_ADD:%.*]] = add i64 [[LAST_INDEX_SEXT]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[MINIDX_UMAX]], i64 [[MINIDX_ADD]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_UMAX_MINUS1:%.*]] = sub i64 [[MINIDX_UMAX]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_ADD_PTR_I:%.*]] = getelementptr inbounds float, ptr [[FIRST_PTR]], i64 [[MINIDX_UMAX_MINUS1]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_LOADVALS:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[MINIDX_ADD_PTR_I]], i32 1, <vscale x 4 x i1> [[MINIDX_MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_CURRENTVALS:%.*]] = select <vscale x 4 x i1> [[MINIDX_MASK]], <vscale x 4 x float> [[MINIDX_LOADVALS]], <vscale x 4 x float> splat (float 0x7FF0000000000000)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[MINIDX_MASK]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_REVERSEVALS:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[MINIDX_CURRENTVALS]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_MINVAL:%.*]] = call float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float> [[MINIDX_REVERSEVALS]])
+; CHECK-LOOP-IDIOM-NEXT: br i1 true, label %[[MINIDX_VEC_ENTRY:.*]], label %[[LOOP:.*]]
+; CHECK-LOOP-IDIOM: [[LOOP]]:
+; CHECK-LOOP-IDIOM-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX_SEXT]], %[[LOOP_PREHEADER]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[DIFF]], %[[LOOP_PREHEADER]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[INDEX:%.*]] = phi i32 [ [[SELECT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX]], %[[LOOP_PREHEADER]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-LOOP-IDIOM-NEXT: [[LOAD1_PTR:%.*]] = getelementptr float, ptr [[FIRST_PTR]], i64 [[IV]]
+; CHECK-LOOP-IDIOM-NEXT: [[LOAD1:%.*]] = load float, ptr [[LOAD1_PTR]], align 4
+; CHECK-LOOP-IDIOM-NEXT: [[INDEX_SEXT:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-LOOP-IDIOM-NEXT: [[LOAD2_PTR:%.*]] = getelementptr float, ptr [[SECOND_PTR]], i64 [[INDEX_SEXT]]
+; CHECK-LOOP-IDIOM-NEXT: [[LOAD2:%.*]] = load float, ptr [[LOAD2_PTR]], align 4
+; CHECK-LOOP-IDIOM-NEXT: [[CMP:%.*]] = fcmp contract olt float [[LOAD1]], [[LOAD2]]
+; CHECK-LOOP-IDIOM-NEXT: [[IV_NEXT_TRUNC:%.*]] = trunc nsw i64 [[IV_NEXT]] to i32
+; CHECK-LOOP-IDIOM-NEXT: [[SELECT]] = select i1 [[CMP]], i32 [[IV_NEXT_TRUNC]], i32 [[INDEX]]
+; CHECK-LOOP-IDIOM-NEXT: [[DEC]] = add nsw i64 [[DEC_IV]], -1
+; CHECK-LOOP-IDIOM-NEXT: [[LOOP_COND:%.*]] = icmp sgt i64 [[DEC_IV]], 1
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[DOT_CRIT_EDGE_LOOPEXIT:.*]]
+; CHECK-LOOP-IDIOM: [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-LOOP-IDIOM-NEXT: [[TMP1:%.*]] = phi i32 [ [[SELECT]], %[[LOOP]] ], [ [[MINIDX_RET_BITCAST:%.*]], %[[MINIDX_END:.*]] ]
+; CHECK-LOOP-IDIOM-NEXT: br [[DOT_CRIT_EDGE]]
+; CHECK-LOOP-IDIOM: [[__CRIT_EDGE:.*:]]
+; CHECK-LOOP-IDIOM-NEXT: [[LAST_INDEX_RET:%.*]] = phi i32 [ [[LAST_INDEX]], %[[ENTRY]] ], [ [[TMP1]], %[[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-LOOP-IDIOM-NEXT: ret i32 [[LAST_INDEX_RET]]
+; CHECK-LOOP-IDIOM: [[MINIDX_VEC_ENTRY]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_ENTRY_CMP:%.*]] = fcmp olt float [[MINIDX_MINVAL]], 0x7FF0000000000000
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_ENTRY_CMP]], label %[[MINIDX_PARTIAL_1_IF:.*]], label %[[MINIDX_PARTIAL_1_PROC_EXIT:.*]]
+; CHECK-LOOP-IDIOM: [[MINIDX_PARTIAL_1_IF]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINVAL_SPLAT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[MINIDX_MINVAL]], i64 0
+; CHECK-LOOP-IDIOM-NEXT: [[MINVAL_SPLAT_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[MINVAL_SPLAT_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_CMP:%.*]] = fcmp oeq <vscale x 4 x float> [[MINIDX_REVERSEVALS]], [[MINVAL_SPLAT_SPLAT]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_AND:%.*]] = and <vscale x 4 x i1> [[MINIDX_REVERSE]], [[MINIDX_PARTIAL_1_CMP]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_CTTZ:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[MINIDX_PARTIAL_1_AND]], i1 true)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_TMP:%.*]] = sub i64 [[MINIDX_VLEN]], [[MINIDX_PARTIAL_1_CTTZ]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_TMP_MINUS1:%.*]] = sub i64 [[MINIDX_PARTIAL_1_TMP]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_ADD2:%.*]] = add i64 [[MINIDX_UMAX]], [[MINIDX_PARTIAL_1_TMP_MINUS1]]
+; CHECK-LOOP-IDIOM-NEXT: br label %[[MINIDX_PARTIAL_1_PROC_EXIT]]
+; CHECK-LOOP-IDIOM: [[MINIDX_PARTIAL_1_PROC_EXIT]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_EXIT_KNOWN_MIN:%.*]] = phi float [ [[MINIDX_MINVAL]], %[[MINIDX_PARTIAL_1_IF]] ], [ 0x7FF0000000000000, %[[MINIDX_VEC_ENTRY]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[PARTIAL1_EXIT_KNOWN_ARG:%.*]] = phi i64 [ [[MINIDX_PARTIAL_1_ADD2]], %[[MINIDX_PARTIAL_1_IF]] ], [ 0, %[[MINIDX_VEC_ENTRY]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_PROC_EXIT_ADD:%.*]] = add i64 [[MINIDX_VLEN]], [[FIRST_INDEX_SEXT]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_PROC_EXIT_ICMP:%.*]] = icmp ult i64 [[MINIDX_UMAX]], [[MINIDX_PARTIAL_1_PROC_EXIT_ADD]]
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_PARTIAL_1_PROC_EXIT_ICMP]], label %[[MINIDX_VECT_END:.*]], label %[[MINIDX_WHILE_BODY_PH:.*]]
+; CHECK-LOOP-IDIOM: [[MINIDX_WHILE_BODY_PH]]:
+; CHECK-LOOP-IDIOM-NEXT: br label %[[MINIDX_VECT_BODY:.*]]
+; CHECK-LOOP-IDIOM: [[MINIDX_VECT_BODY]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_PHI1:%.*]] = phi i64 [ [[MINIDX_UMAX]], %[[MINIDX_WHILE_BODY_PH]] ], [ [[MINIDX_VECT_BODY_SUB:%.*]], %[[MINIDX_VECT_CONTINUE:.*]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_KNOWN_ARG:%.*]] = phi i64 [ [[PARTIAL1_EXIT_KNOWN_ARG]], %[[MINIDX_WHILE_BODY_PH]] ], [ [[MINIDX_VECT_CONTINUE_KNOWN_ARG:%.*]], %[[MINIDX_VECT_CONTINUE]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_KNOWN_MIN:%.*]] = phi float [ [[MINIDX_PARTIAL_1_EXIT_KNOWN_MIN]], %[[MINIDX_WHILE_BODY_PH]] ], [ [[MINIDX_VECT_CONTINUE_KNOWN_MIN:%.*]], %[[MINIDX_VECT_CONTINUE]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_SUB]] = sub i64 [[MINIDX_VECT_BODY_PHI1]], [[MINIDX_VLEN]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_SUB_MINUS1:%.*]] = sub i64 [[MINIDX_VECT_BODY_SUB]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_GEP:%.*]] = getelementptr inbounds float, ptr [[FIRST_PTR]], i64 [[MINIDX_VECT_BODY_SUB_MINUS1]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_UNMASKEDLOAD:%.*]] = load <vscale x 4 x float>, ptr [[MINIDX_VECT_BODY_GEP]], align 16
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[MINIDX_VECT_BODY_UNMASKEDLOAD]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_REDUCE:%.*]] = call float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float> [[MINIDX_VECT_BODY_REVERSE]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_FCMP:%.*]] = fcmp olt float [[MINIDX_VECT_BODY_REDUCE]], [[MINIDX_VECT_BODY_KNOWN_MIN]]
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_VECT_BODY_FCMP]], label %[[MINIDX_VECT_UPDATE:.*]], label %[[MINIDX_VECT_CONTINUE]]
+; CHECK-LOOP-IDIOM: [[MINIDX_VECT_UPDATE]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_SPLATINSERT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[MINIDX_VECT_BODY_REDUCE]], i64 0
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_SPLATINSERT_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[MINIDX_VECT_UPDATE_SPLATINSERT_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_FCMP:%.*]] = fcmp ueq <vscale x 4 x float> [[MINIDX_VECT_BODY_REVERSE]], [[MINIDX_VECT_UPDATE_SPLATINSERT_SPLAT]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_CTTZ:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[MINIDX_VECT_UPDATE_FCMP]], i1 true)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_MUL:%.*]] = mul i64 [[MINIDX_VECT_UPDATE_CTTZ]], -1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_ADD:%.*]] = add i64 [[MINIDX_VECT_BODY_PHI1]], [[MINIDX_VECT_UPDATE_MUL]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_ADD2:%.*]] = add i64 [[MINIDX_VECT_UPDATE_ADD]], -1
+; CHECK-LOOP-IDIOM-NEXT: br label %[[MINIDX_VECT_CONTINUE]]
+; CHECK-LOOP-IDIOM: [[MINIDX_VECT_CONTINUE]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_CONTINUE_KNOWN_MIN]] = phi float [ [[MINIDX_VECT_BODY_REDUCE]], %[[MINIDX_VECT_UPDATE]] ], [ [[MINIDX_VECT_BODY_KNOWN_MIN]], %[[MINIDX_VECT_BODY]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_CONTINUE_KNOWN_ARG]] = phi i64 [ [[MINIDX_VECT_BODY_ADD2]], %[[MINIDX_VECT_UPDATE]] ], [ [[MINIDX_VECT_BODY_KNOWN_ARG]], %[[MINIDX_VECT_BODY]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_CONTINUE_ICMP:%.*]] = icmp ult i64 [[MINIDX_VECT_BODY_SUB]], [[MINIDX_PARTIAL_1_PROC_EXIT_ADD]]
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_VECT_CONTINUE_ICMP]], label %[[MINIDX_VECT_END]], label %[[MINIDX_VECT_BODY]]
+; CHECK-LOOP-IDIOM: [[MINIDX_VECT_END]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_END_KNOWN_MIN_LCSSA:%.*]] = phi float [ [[MINIDX_PARTIAL_1_EXIT_KNOWN_MIN]], %[[MINIDX_PARTIAL_1_PROC_EXIT]] ], [ [[MINIDX_VECT_CONTINUE_KNOWN_MIN]], %[[MINIDX_VECT_CONTINUE]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_END_KNOWN_ARG_LCSSA:%.*]] = phi i64 [ [[PARTIAL1_EXIT_KNOWN_ARG]], %[[MINIDX_PARTIAL_1_PROC_EXIT]] ], [ [[MINIDX_VECT_CONTINUE_KNOWN_ARG]], %[[MINIDX_VECT_CONTINUE]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_END_LCSSA:%.*]] = phi i64 [ [[MINIDX_UMAX]], %[[MINIDX_PARTIAL_1_PROC_EXIT]] ], [ [[MINIDX_VECT_BODY_SUB]], %[[MINIDX_VECT_CONTINUE]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_END_CMP:%.*]] = icmp ugt i64 [[MINIDX_VECT_END_LCSSA]], [[FIRST_INDEX_SEXT]]
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_VECT_END_CMP]], label %[[MINIDX_PARTIAL_2_IF:.*]], label %[[MINIDX_END]]
+; CHECK-LOOP-IDIOM: [[MINIDX_PARTIAL_2_IF]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_ADD_ZERO:%.*]] = add i64 [[MINIDX_VECT_END_LCSSA]], 0
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[FIRST_INDEX_SEXT]], i64 [[MINIDX_PARTIAL_2_IF_ADD_ZERO]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_MASK_REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[MINIDX_PARTIAL_2_IF_MASK]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_IPOS1_MINUS1:%.*]] = sub i64 [[FIRST_INDEX_SEXT]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_GEP:%.*]] = getelementptr inbounds float, ptr [[FIRST_PTR]], i64 [[MINIDX_PARTIAL_2_IF_IPOS1_MINUS1]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[MINIDX_PARTIAL_2_IF_GEP]], i32 1, <vscale x 4 x i1> [[MINIDX_PARTIAL_2_IF_MASK_REVERSE]], <vscale x 4 x float> zeroinitializer)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[MINIDX_PARTIAL_2_IF_LOAD]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_REDUCE:%.*]] = call float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float> [[MINIDX_PARTIAL_2_IF_REVERSE]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_FCMP:%.*]] = fcmp olt float [[MINIDX_PARTIAL_2_IF_REDUCE]], [[MINIDX_VECT_END_KNOWN_MIN_LCSSA]]
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_PARTIAL_2_IF_FCMP]], label %[[MINIDX_PARTIAL_2_EXIT:.*]], label %[[MINIDX_END]]
+; CHECK-LOOP-IDIOM: [[MINIDX_PARTIAL_2_EXIT]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_SPLATINSERT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[MINIDX_PARTIAL_2_IF_REDUCE]], i64 0
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_SPLATINSERT_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[MINIDX_PARTIAL_2_EXIT_SPLATINSERT_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_FCMP:%.*]] = fcmp oeq <vscale x 4 x float> [[MINIDX_PARTIAL_2_IF_REVERSE]], [[MINIDX_PARTIAL_2_EXIT_SPLATINSERT_SPLAT]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_AND:%.*]] = and <vscale x 4 x i1> [[MINIDX_PARTIAL_2_EXIT_FCMP]], [[MINIDX_PARTIAL_2_IF_MASK_REVERSE]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_CTTZ:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[MINIDX_PARTIAL_2_EXIT_AND]], i1 true)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_TMP:%.*]] = sub i64 [[MINIDX_VLEN]], [[MINIDX_PARTIAL_2_EXIT_CTTZ]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_TMP_MINUS1:%.*]] = sub i64 [[MINIDX_PARTIAL_2_EXIT_TMP]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_ADD2:%.*]] = add i64 [[FIRST_INDEX_SEXT]], [[MINIDX_PARTIAL_2_EXIT_TMP_MINUS1]]
+; CHECK-LOOP-IDIOM-NEXT: br label %[[MINIDX_END]]
+; CHECK-LOOP-IDIOM: [[MINIDX_END]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_RET:%.*]] = phi i64 [ [[MINIDX_VECT_END_KNOWN_ARG_LCSSA]], %[[MINIDX_VECT_END]] ], [ [[MINIDX_PARTIAL_2_EXIT_ADD2]], %[[MINIDX_PARTIAL_2_EXIT]] ], [ [[MINIDX_VECT_END_KNOWN_ARG_LCSSA]], %[[MINIDX_PARTIAL_2_IF]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_RET_BITCAST]] = trunc i64 [[MINIDX_RET]] to i32
+; CHECK-LOOP-IDIOM-NEXT: br label %[[DOT_CRIT_EDGE_LOOPEXIT]]
+;
+entry:
+ %first_index_sext = sext i32 %first_index to i64
+ %last_index_neg = sub i32 0, %last_index
+ %last_index_neg_sext = sext i32 %last_index_neg to i64
+ %add = add nsw i64 %first_index_sext, %last_index_neg_sext
+ %diff = sub nsw i64 0, %add
+ %first_ptr = getelementptr i8, ptr %array, i64 -8
+ %second_ptr = getelementptr i8, ptr %array, i64 -4
+ %early_exit_cond = icmp slt i64 %add, 0
+ br i1 %early_exit_cond, label %loop.preheader, label %._crit_edge
+
+loop.preheader: ; preds = %entry
+ %last_index_sext = sext i32 %last_index to i64
+ br label %loop
+
+loop: ; preds = %loop.preheader, %loop
+ %iv = phi i64 [%iv.next, %loop], [ %last_index_sext, %loop.preheader ]
+ %dec_iv = phi i64 [ %dec, %loop ], [ %diff, %loop.preheader ]
+ %index = phi i32 [ %select, %loop ], [ %last_index, %loop.preheader ]
+ %iv.next = add nsw i64 %iv, -1
+ %load1_ptr = getelementptr float, ptr %first_ptr, i64 %iv
+ %load1 = load float, ptr %load1_ptr, align 4
+ %index_sext = sext i32 %index to i64
+ %load2_ptr = getelementptr float, ptr %second_ptr, i64 %index_sext
+ %load2 = load float, ptr %load2_ptr, align 4
+ %cmp = fcmp contract olt float %load1, %load2
+ %iv.next.trunc = trunc nsw i64 %iv.next to i32
+ %select = select i1 %cmp, i32 %iv.next.trunc, i32 %index
+ %dec = add nsw i64 %dec_iv, -1
+ %loop_cond = icmp sgt i64 %dec_iv, 1
+ br i1 %loop_cond, label %loop, label %._crit_edge
+
+._crit_edge: ; preds = %loop, %entry
+ %last_index_ret = phi i32 [ %select, %loop ], [ %last_index, %entry ]
+ ret i32 %last_index_ret
+}