[llvm] [LV][EVL] Introduce the EVLIndVarSimplify Pass for EVL-vectorized loops (PR #131005)
Min-Yih Hsu via llvm-commits
llvm-commits at lists.llvm.org
Tue May 13 10:56:56 PDT 2025
https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/131005
>From e96162fa1920d788e9385fc0d52b7115a5c47442 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 3 Mar 2025 13:21:57 -0800
Subject: [PATCH 1/5] [LV][EVL] Introduce the EVLIVSimplify Pass for
EVL-vectorized loops
TBA...
---
.../Transforms/Vectorize/EVLIndVarSimplify.h | 31 ++
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Passes/PassBuilderPipelines.cpp | 1 +
llvm/lib/Passes/PassRegistry.def | 1 +
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 7 +
llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 +
.../Vectorize/EVLIndVarSimplify.cpp | 242 +++++++++++++
.../LoopVectorize/evl-iv-simplify.ll | 333 ++++++++++++++++++
8 files changed, 617 insertions(+)
create mode 100644 llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
create mode 100644 llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
create mode 100644 llvm/test/Transforms/LoopVectorize/evl-iv-simplify.ll
diff --git a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h b/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
new file mode 100644
index 0000000000000..3178dc762a195
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
@@ -0,0 +1,31 @@
+//===------ EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass rewrites a vectorized loop with a canonical IV to use the EVL-based
+// IV instead, if the loop was tail-folded by predicated EVL.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
+#define LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
+
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class Loop;
+class LPMUpdater;
+
+/// Rewrite the latch condition of EVL tail-folded vectorized loops to use the
+/// EVL-based induction variable, so that the canonical IV can be removed.
+struct EVLIndVarSimplifyPass : public PassInfoMixin<EVLIndVarSimplifyPass> {
+ PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 7740f622ede7c..0be40a38a329c 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -370,6 +370,7 @@
#include "llvm/Transforms/Utils/SymbolRewriter.h"
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index f172271be09ab..857ba07d3dd70 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -142,6 +142,7 @@
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index d5d1b2173da69..52bfaa948e6e7 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -672,6 +672,7 @@ LOOP_ANALYSIS("should-run-extra-simple-loop-unswitch",
#endif
LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass())
LOOP_PASS("dot-ddg", DDGDotPrinterPass())
+LOOP_PASS("evl-iv-simplify", EVLIndVarSimplifyPass())
LOOP_PASS("guard-widening", GuardWideningPass())
LOOP_PASS("extra-simple-loop-unswitch-passes",
ExtraLoopPassManager<ShouldRunExtraSimpleLoopUnswitch>())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index d11ce46bf78b5..15dd4d57727dd 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -37,6 +37,7 @@
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
#include <optional>
using namespace llvm;
@@ -645,6 +646,12 @@ void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
OptimizationLevel Level) {
LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated));
});
+
+ PB.registerVectorizerEndEPCallback(
+ [](FunctionPassManager &FPM, OptimizationLevel Level) {
+ if (Level.isOptimizingForSpeed())
+ FPM.addPass(createFunctionToLoopPassAdaptor(EVLIndVarSimplifyPass()));
+ });
}
yaml::MachineFunctionInfo *
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 0dc6a7d2f594f..2b5488b2e8126 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,4 +1,5 @@
add_llvm_component_library(LLVMVectorize
+ EVLIndVarSimplify.cpp
LoadStoreVectorizer.cpp
LoopIdiomVectorize.cpp
LoopVectorizationLegality.cpp
diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
new file mode 100644
index 0000000000000..8ffe287c183f1
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
@@ -0,0 +1,242 @@
+//===---- EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass rewrites a vectorized loop with a canonical IV to use the EVL-based
+// IV instead, if the loop was tail-folded by predicated EVL.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+
+#define DEBUG_TYPE "evl-iv-simplify"
+
+using namespace llvm;
+
+STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated");
+
+static cl::opt<bool> EnableEVLIndVarSimplify(
+ "enable-evl-indvar-simplify",
+ cl::desc("Enable EVL-based induction variable simplify Pass"), cl::Hidden,
+ cl::init(true));
+
+namespace {
+struct EVLIndVarSimplifyImpl {
+ ScalarEvolution &SE;
+
+ explicit EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR)
+ : SE(LAR.SE) {}
+
+ explicit EVLIndVarSimplifyImpl(ScalarEvolution &SE) : SE(SE) {}
+
+ // Returns true if the loop was modified.
+ bool run(Loop &L);
+};
+} // anonymous namespace
+
+// Returns the constant part of the VF from the induction variable's step SCEV
+// expression, or 0 if it cannot be determined.
+static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
+ if (!Step)
+ return 0U;
+
+ // Look for an IV step of the form `(<constant VF> x vscale)`; the constant
+ // is only accepted if it fits in 32 bits.
+ if (auto *Mul = dyn_cast<SCEVMulExpr>(Step)) {
+ if (Mul->getNumOperands() == 2) {
+ const SCEV *LHS = Mul->getOperand(0);
+ const SCEV *RHS = Mul->getOperand(1);
+ if (auto *Const = dyn_cast<SCEVConstant>(LHS)) {
+ uint64_t V = Const->getAPInt().getLimitedValue();
+ if (isa<SCEVVScale>(RHS) && llvm::isUInt<32>(V))
+ return V;
+ }
+ }
+ }
+
+ // Otherwise, if the parent function's vscale_range pins vscale to a single
+ // value, the step may be a plain constant; recover VF as |step| / vscale.
+ if (F.hasFnAttribute(Attribute::VScaleRange))
+ if (auto *ConstStep = dyn_cast<SCEVConstant>(Step)) {
+ APInt V = ConstStep->getAPInt().abs();
+ ConstantRange CR = llvm::getVScaleRange(&F, 64);
+ if (const APInt *Fixed = CR.getSingleElement()) {
+ V = V.zextOrTrunc(Fixed->getBitWidth());
+ uint64_t VF = V.udiv(*Fixed).getLimitedValue();
+ if (VF && llvm::isUInt<32>(VF) &&
+ // Make sure step is divisible by vscale.
+ V.urem(*Fixed).isZero())
+ return VF;
+ }
+ }
+
+ return 0U;
+}
+
+bool EVLIndVarSimplifyImpl::run(Loop &L) {
+ if (!EnableEVLIndVarSimplify)
+ return false;
+
+ if (!getBooleanLoopAttribute(&L, "llvm.loop.isvectorized") ||
+ !getBooleanLoopAttribute(&L, "llvm.loop.isvectorized.withevl"))
+ return false;
+
+ BasicBlock *LatchBlock = L.getLoopLatch();
+ ICmpInst *OrigLatchCmp = L.getLatchCmpInst();
+ if (!LatchBlock || !OrigLatchCmp)
+ return false;
+
+ InductionDescriptor IVD;
+ PHINode *IndVar = L.getInductionVariable(SE);
+ if (!IndVar || !L.getInductionDescriptor(SE, IVD)) {
+ LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName()
+ << "\n");
+ return false;
+ }
+
+ BasicBlock *InitBlock, *BackEdgeBlock;
+ if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) {
+ LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in "
+ << L.getName() << "\n");
+ return false;
+ }
+
+ // Retrieve the loop bounds.
+ std::optional<Loop::LoopBounds> Bounds = L.getBounds(SE);
+ if (!Bounds) {
+ LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName()
+ << "\n");
+ return false;
+ }
+ Value *CanonicalIVInit = &Bounds->getInitialIVValue();
+ Value *CanonicalIVFinal = &Bounds->getFinalIVValue();
+
+ const SCEV *StepV = IVD.getStep();
+ uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent());
+ if (!VF) {
+ LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV
+ << "'\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName()
+ << "\n");
+
+ // Try to find the EVL-based induction variable.
+ using namespace PatternMatch;
+ BasicBlock *BB = IndVar->getParent();
+
+ Value *EVLIndVar = nullptr;
+ Value *RemTC = nullptr;
+ Value *TC = nullptr;
+ auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>(
+ m_Value(RemTC), m_SpecificInt(VF),
+ /*Scalable=*/m_SpecificInt(1));
+ for (auto &PN : BB->phis()) {
+ if (&PN == IndVar)
+ continue;
+
+ // Check 1: it has to contain both incoming (init) & backedge blocks
+ // from IndVar.
+ if (PN.getBasicBlockIndex(InitBlock) < 0 ||
+ PN.getBasicBlockIndex(BackEdgeBlock) < 0)
+ continue;
+ // Check 2: EVL index is always increasing, thus its initial value has to be
+ // equal to either the initial IV value (when the canonical IV is also
+ // increasing) or the last IV value (when canonical IV is decreasing).
+ Value *Init = PN.getIncomingValueForBlock(InitBlock);
+ using Direction = Loop::LoopBounds::Direction;
+ switch (Bounds->getDirection()) {
+ case Direction::Increasing:
+ if (Init != CanonicalIVInit)
+ continue;
+ break;
+ case Direction::Decreasing:
+ if (Init != CanonicalIVFinal)
+ continue;
+ break;
+ case Direction::Unknown:
+ // Be permissive: accept PN if its init value matches either the initial
+ // or the final IV value.
+ if (Init != CanonicalIVInit && Init != CanonicalIVFinal)
+ continue;
+ break;
+ }
+ Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock);
+ assert(RecValue);
+
+ LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN
+ << "\n");
+
+ // Check 3: Pattern match the EVL-based index and the total trip count (TC).
+ // NOTE(review): TC is never checked against the canonical IV's bound here.
+ if (match(RecValue,
+ m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) &&
+ match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) {
+ EVLIndVar = RecValue;
+ break;
+ }
+ }
+
+ if (!EVLIndVar || !TC)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n");
+
+ // Create an EVL-based comparison and replace the branch to use it as
+ // predicate.
+
+ // Loop::getLatchCmpInst check at the beginning of this function has ensured
+ // that latch block ends in a conditional branch.
+ auto *LatchBranch = cast<BranchInst>(LatchBlock->getTerminator());
+ assert(LatchBranch->isConditional());
+ ICmpInst::Predicate Pred;
+ if (LatchBranch->getSuccessor(0) == L.getHeader())
+ Pred = ICmpInst::ICMP_NE;
+ else
+ Pred = ICmpInst::ICMP_EQ;
+
+ IRBuilder<> Builder(OrigLatchCmp);
+ auto *NewLatchCmp = Builder.CreateICmp(Pred, EVLIndVar, TC);
+ OrigLatchCmp->replaceAllUsesWith(NewLatchCmp);
+
+ // llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are
+ // not used outside the cycles. However, in this case the now-RAUW-ed
+ // OrigLatchCmp will be considered a use outside the cycle while in reality
+ // it's practically dead. Thus we need to remove it before calling
+ // RecursivelyDeleteDeadPHINode.
+ (void)RecursivelyDeleteTriviallyDeadInstructions(OrigLatchCmp);
+ if (llvm::RecursivelyDeleteDeadPHINode(IndVar))
+ LLVM_DEBUG(dbgs() << "Removed original IndVar\n");
+
+ ++NumEliminatedCanonicalIV;
+
+ return true;
+}
+
+PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM,
+                                             LoopStandardAnalysisResults &AR,
+                                             LPMUpdater &U) {
+  // CFG analyses are reported as preserved when the loop was modified.
+  return EVLIndVarSimplifyImpl(AR).run(L)
+      ? PreservedAnalyses::allInSet<CFGAnalyses>() : PreservedAnalyses::all();
+}
diff --git a/llvm/test/Transforms/LoopVectorize/evl-iv-simplify.ll b/llvm/test/Transforms/LoopVectorize/evl-iv-simplify.ll
new file mode 100644
index 0000000000000..0583214855e52
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/evl-iv-simplify.ll
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify)' < %s | FileCheck %s
+; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify),function(simplifycfg,dce)' < %s | FileCheck %s --check-prefix=LOOP-DEL
+
+define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %N) vscale_range(2, 1024) {
+; CHECK-LABEL: define void @simple(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; CHECK-NEXT: [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[C]], [[VP_OP_LOAD1]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT: [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+; LOOP-DEL-LABEL: define void @simple(
+; LOOP-DEL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
+; LOOP-DEL: vector.ph:
+; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
+; LOOP-DEL: vector.body:
+; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP4]], i32 4, i1 true)
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]]
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; LOOP-DEL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[C]], [[VP_OP_LOAD1]]
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; LOOP-DEL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP11]], ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LOOP-DEL: for.body:
+; LOOP-DEL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; LOOP-DEL-NEXT: [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; LOOP-DEL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; LOOP-DEL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; LOOP-DEL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; LOOP-DEL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; LOOP-DEL: for.cond.cleanup:
+; LOOP-DEL-NEXT: ret void
+;
+entry:
+ %0 = sub i64 -1, %N
+ %1 = call i64 @llvm.vscale.i64()
+ %2 = mul i64 %1, 4
+ %3 = icmp ult i64 %0, %2
+ br i1 %3, label %scalar.ph, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %4 = call i64 @llvm.vscale.i64()
+ %5 = mul i64 %4, 4
+ %6 = call i64 @llvm.vscale.i64()
+ %7 = mul i64 %6, 4
+ %8 = sub i64 %7, 1
+ %n.rnd.up = add i64 %N, %8
+ %n.mod.vf = urem i64 %n.rnd.up, %5
+ %n.vec = sub i64 %n.rnd.up, %n.mod.vf
+ %9 = call i64 @llvm.vscale.i64()
+ %10 = mul i64 %9, 4
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ]
+ %11 = sub i64 %N, %evl.based.iv
+ %12 = call i32 @llvm.experimental.get.vector.length.i64(i64 %11, i32 4, i1 true)
+ %13 = add i64 %evl.based.iv, 0
+ %14 = getelementptr inbounds i32, ptr %b, i64 %13
+ %15 = getelementptr inbounds i32, ptr %14, i32 0
+ %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %15, <vscale x 4 x i1> splat (i1 true), i32 %12)
+ %18 = add nsw <vscale x 4 x i32> %c, %vp.op.load
+ %19 = getelementptr inbounds i32, ptr %a, i64 %13
+ %20 = getelementptr inbounds i32, ptr %19, i32 0
+ call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %18, ptr align 4 %20, <vscale x 4 x i1> splat (i1 true), i32 %12)
+ %21 = zext i32 %12 to i64
+ %index.evl.next = add i64 %21, %evl.based.iv
+ %index.next = add nuw i64 %index, %10
+ %22 = icmp eq i64 %index.next, %n.vec
+ br i1 %22, label %middle.block, label %vector.body, !llvm.loop !0
+
+middle.block: ; preds = %vector.body
+ br i1 true, label %for.cond.cleanup, label %scalar.ph
+
+scalar.ph: ; preds = %entry, %middle.block
+ %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %entry ]
+ br label %for.body
+
+for.body: ; preds = %for.body, %scalar.ph
+ %iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %23 = load i32, ptr %arrayidx, align 4
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %23, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !3
+
+for.cond.cleanup: ; preds = %middle.block, %for.body
+ ret void
+}
+
+; Fixed IV steps resulting from vscale_range with a single element
+
+define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 {
+; CHECK-LABEL: define void @fixed_iv_step(
+; CHECK-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]]
+; CHECK: for.end.loopexit5:
+; CHECK-NEXT: br label [[FOR_END:%.*]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+; LOOP-DEL-LABEL: define void @fixed_iv_step(
+; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
+; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
+; LOOP-DEL: vector.body:
+; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
+; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]]
+; LOOP-DEL: for.end:
+; LOOP-DEL-NEXT: ret void
+;
+entry:
+ br label %vector.ph
+
+vector.ph:
+ %n.rnd.up = add nsw i64 %N, 15
+ %n.vec = and i64 %n.rnd.up, -16
+ %broadcast.splatinsert = insertelement <vscale x 2 x ptr> poison, ptr %arg0, i64 0
+ %broadcast.splat = shufflevector <vscale x 2 x ptr> %broadcast.splatinsert, <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body:
+ %lsr.iv32 = phi i64 [ %lsr.iv.next33, %vector.body ], [ %n.vec, %vector.ph ]
+ %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ]
+ %41 = sub i64 %N, %evl.based.iv
+ %42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true)
+ %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv
+ tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> %broadcast.splat, ptr align 8 %gep, <vscale x 2 x i1> splat (i1 true), i32 %42)
+ %43 = zext i32 %42 to i64
+ %index.evl.next = add i64 %evl.based.iv, %43
+ %lsr.iv.next33 = add i64 %lsr.iv32, -16
+ %44 = icmp eq i64 %lsr.iv.next33, 0
+ br i1 %44, label %for.end.loopexit5, label %vector.body, !llvm.loop !3
+
+for.end.loopexit5:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Fixed IV step and trip count
+define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 {
+; CHECK-LABEL: define void @fixed_iv_step_tc(
+; CHECK-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]]
+; CHECK: for.end.loopexit5:
+; CHECK-NEXT: br label [[FOR_END:%.*]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+; LOOP-DEL-LABEL: define void @fixed_iv_step_tc(
+; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]]) #[[ATTR1]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
+; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
+; LOOP-DEL: vector.body:
+; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
+; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]]
+; LOOP-DEL: for.end:
+; LOOP-DEL-NEXT: ret void
+;
+entry:
+ br label %vector.ph
+
+vector.ph:
+ %n.rnd.up = add nsw i64 87, 15
+ %n.vec = and i64 %n.rnd.up, -16
+ %broadcast.splatinsert = insertelement <vscale x 2 x ptr> poison, ptr %arg0, i64 0
+ %broadcast.splat = shufflevector <vscale x 2 x ptr> %broadcast.splatinsert, <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body:
+ %lsr.iv32 = phi i64 [ %lsr.iv.next33, %vector.body ], [ %n.vec, %vector.ph ]
+ %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ]
+ %41 = sub i64 87, %evl.based.iv
+ %42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true)
+ %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv
+ tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> %broadcast.splat, ptr align 8 %gep, <vscale x 2 x i1> splat (i1 true), i32 %42)
+ %43 = zext i32 %42 to i64
+ %index.evl.next = add i64 %evl.based.iv, %43
+ %lsr.iv.next33 = add i64 %lsr.iv32, -16
+ %44 = icmp eq i64 %lsr.iv.next33, 0
+ br i1 %44, label %for.end.loopexit5, label %vector.body, !llvm.loop !3
+
+for.end.loopexit5:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg)
+
+declare <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr nocapture, <vscale x 4 x i1>, i32)
+
+declare void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, <vscale x 4 x i1>, i32)
+
+attributes #0 = { vscale_range(8,8) }
+
+!0 = distinct !{!0, !1, !2, !4}
+!1 = !{!"llvm.loop.isvectorized", i32 1}
+!2 = !{!"llvm.loop.unroll.runtime.disable"}
+!3 = distinct !{!3, !2, !1, !4}
+!4 = !{!"llvm.loop.isvectorized.withevl", i32 1}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META3]] = !{!"llvm.loop.isvectorized.withevl", i32 1}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]], [[META3]]}
+;.
+; LOOP-DEL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; LOOP-DEL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; LOOP-DEL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; LOOP-DEL: [[META3]] = !{!"llvm.loop.isvectorized.withevl", i32 1}
+; LOOP-DEL: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]], [[META3]]}
+;.
>From 9b41ae3d4f526a126eb78b6c42072544af56b7d7 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 31 Mar 2025 13:24:41 -0700
Subject: [PATCH 2/5] !fixup Updated with the latest EVL-vectorized metadata
---
llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp | 8 ++++++--
llvm/test/Transforms/LoopVectorize/evl-iv-simplify.ll | 6 +++---
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
index 8ffe287c183f1..6314f3ecf4f07 100644
--- a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
@@ -97,8 +97,12 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
if (!EnableEVLIndVarSimplify)
return false;
- if (!getBooleanLoopAttribute(&L, "llvm.loop.isvectorized") ||
- !getBooleanLoopAttribute(&L, "llvm.loop.isvectorized.withevl"))
+ if (!getBooleanLoopAttribute(&L, "llvm.loop.isvectorized"))
+ return false;
+ const MDOperand *EVLMD =
+ findStringMetadataForLoop(&L, "llvm.loop.isvectorized.tailfoldingstyle")
+ .value_or(nullptr);
+ if (!EVLMD || !EVLMD->equalsStr("evl"))
return false;
BasicBlock *LatchBlock = L.getLoopLatch();
diff --git a/llvm/test/Transforms/LoopVectorize/evl-iv-simplify.ll b/llvm/test/Transforms/LoopVectorize/evl-iv-simplify.ll
index 0583214855e52..4de0e666149f3 100644
--- a/llvm/test/Transforms/LoopVectorize/evl-iv-simplify.ll
+++ b/llvm/test/Transforms/LoopVectorize/evl-iv-simplify.ll
@@ -317,17 +317,17 @@ attributes #0 = { vscale_range(8,8) }
!1 = !{!"llvm.loop.isvectorized", i32 1}
!2 = !{!"llvm.loop.unroll.runtime.disable"}
!3 = distinct !{!3, !2, !1, !4}
-!4 = !{!"llvm.loop.isvectorized.withevl", i32 1}
+!4 = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[META3]] = !{!"llvm.loop.isvectorized.withevl", i32 1}
+; CHECK: [[META3]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]], [[META3]]}
;.
; LOOP-DEL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
; LOOP-DEL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; LOOP-DEL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; LOOP-DEL: [[META3]] = !{!"llvm.loop.isvectorized.withevl", i32 1}
+; LOOP-DEL: [[META3]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
; LOOP-DEL: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]], [[META3]]}
;.
>From f21428d120bee44792f6b392dc30ba411a822d0f Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Wed, 7 May 2025 10:29:19 -0700
Subject: [PATCH 3/5] fixup! Add some optimization remarks
---
.../Vectorize/EVLIndVarSimplify.cpp | 47 +++++++++++++++----
1 file changed, 39 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
index 6314f3ecf4f07..0f4c061845f26 100644
--- a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -43,19 +44,19 @@ static cl::opt<bool> EnableEVLIndVarSimplify(
namespace {
struct EVLIndVarSimplifyImpl {
ScalarEvolution &SE;
+ OptimizationRemarkEmitter *ORE = nullptr;
- explicit EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR)
- : SE(LAR.SE) {}
-
- explicit EVLIndVarSimplifyImpl(ScalarEvolution &SE) : SE(SE) {}
+ EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR,
+ OptimizationRemarkEmitter *ORE)
+ : SE(LAR.SE), ORE(ORE) {}
// Returns true if modify the loop.
bool run(Loop &L);
};
} // anonymous namespace
-// Returns the constant part of vectorization factor from the induction
-// variable's step value SCEV expression.
+/// Returns the constant part of the vectorization factor from the induction
+/// variable's step value SCEV expression.
static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
if (!Step)
return 0U;
@@ -113,8 +114,17 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
InductionDescriptor IVD;
PHINode *IndVar = L.getInductionVariable(SE);
if (!IndVar || !L.getInductionDescriptor(SE, IVD)) {
+ const char *Reason = (IndVar ? "induction descriptor is not available"
+ : "cannot recognize induction variable");
LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName()
- << "\n");
+ << " because" << Reason << "\n");
+ if (ORE) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "MissingIndVar",
+ L.getStartLoc(), L.getHeader())
+ << "Cannot retrieve IV because " << ore::NV("Reason", Reason);
+ });
+ }
return false;
}
@@ -205,6 +215,22 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
return false;
LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n");
+ if (ORE) {
+ ORE->emit([&]() {
+ DebugLoc DL;
+ BasicBlock *Region = nullptr;
+ if (auto *I = dyn_cast<Instruction>(EVLIndVar)) {
+ DL = I->getDebugLoc();
+ Region = I->getParent();
+ } else {
+ DL = L.getStartLoc();
+ Region = L.getHeader();
+ }
+ return OptimizationRemark(DEBUG_TYPE, "UseEVLIndVar", DL, Region)
+ << "Using " << ore::NV("EVLIndVar", EVLIndVar)
+ << " for EVL-based IndVar";
+ });
+ }
// Create an EVL-based comparison and replace the branch to use it as
// predicate.
@@ -240,7 +266,12 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM,
LoopStandardAnalysisResults &AR,
LPMUpdater &U) {
- if (EVLIndVarSimplifyImpl(AR).run(L))
+ Function &F = *L.getHeader()->getParent();
+ auto &FAMProxy = LAM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR);
+ OptimizationRemarkEmitter *ORE =
+ FAMProxy.getCachedResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ if (EVLIndVarSimplifyImpl(AR, ORE).run(L))
return PreservedAnalyses::allInSet<CFGAnalyses>();
return PreservedAnalyses::all();
}
>From 896009b8810f6e5ab10a469d5308fa42dfdcb006 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Fri, 9 May 2025 10:24:34 -0700
Subject: [PATCH 4/5] fixup! Address review comments
And add more optimization remarks
---
.../Vectorize/EVLIndVarSimplify.cpp | 38 +++++++++++++++----
.../{ => RISCV}/evl-iv-simplify.ll | 0
2 files changed, 31 insertions(+), 7 deletions(-)
rename llvm/test/Transforms/LoopVectorize/{ => RISCV}/evl-iv-simplify.ll (100%)
diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
index 0f4c061845f26..659ea6e6ca0e2 100644
--- a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
@@ -63,13 +63,14 @@ static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
// Looking for loops with IV step value in the form of `(<constant VF> x
// vscale)`.
- if (auto *Mul = dyn_cast<SCEVMulExpr>(Step)) {
+ if (const auto *Mul = dyn_cast<SCEVMulExpr>(Step)) {
if (Mul->getNumOperands() == 2) {
const SCEV *LHS = Mul->getOperand(0);
const SCEV *RHS = Mul->getOperand(1);
- if (auto *Const = dyn_cast<SCEVConstant>(LHS)) {
+ if (const auto *Const = dyn_cast<SCEVConstant>(LHS);
+ Const && isa<SCEVVScale>(RHS)) {
uint64_t V = Const->getAPInt().getLimitedValue();
- if (isa<SCEVVScale>(RHS) && llvm::isUInt<32>(V))
+ if (llvm::isUInt<32>(V))
return V;
}
}
@@ -78,7 +79,7 @@ static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
// If not, see if the vscale_range of the parent function is a fixed value,
// which makes the step value to be replaced by a constant.
if (F.hasFnAttribute(Attribute::VScaleRange))
- if (auto *ConstStep = dyn_cast<SCEVConstant>(Step)) {
+ if (const auto *ConstStep = dyn_cast<SCEVConstant>(Step)) {
APInt V = ConstStep->getAPInt().abs();
ConstantRange CR = llvm::getVScaleRange(&F, 64);
if (const APInt *Fixed = CR.getSingleElement()) {
@@ -120,7 +121,7 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
<< " because" << Reason << "\n");
if (ORE) {
ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MissingIndVar",
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar",
L.getStartLoc(), L.getHeader())
<< "Cannot retrieve IV because " << ore::NV("Reason", Reason);
});
@@ -132,6 +133,13 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) {
LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in "
<< L.getName() << "\n");
+ if (ORE) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure",
+ L.getStartLoc(), L.getHeader())
+ << "Does not have a unique incoming and backedge";
+ });
+ }
return false;
}
@@ -140,6 +148,13 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
if (!Bounds) {
LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName()
<< "\n");
+ if (ORE) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure",
+ L.getStartLoc(), L.getHeader())
+ << "Could not obtain the loop bounds";
+ });
+ }
return false;
}
Value *CanonicalIVInit = &Bounds->getInitialIVValue();
@@ -150,6 +165,14 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
if (!VF) {
LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV
<< "'\n");
+ if (ORE) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar",
+ L.getStartLoc(), L.getHeader())
+ << "Could not infer VF from IndVar step "
+ << ore::NV("Step", StepV);
+ });
+ }
return false;
}
LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName()
@@ -196,7 +219,7 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
break;
}
Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock);
- assert(RecValue);
+ assert(RecValue && "expect recurrent IndVar value");
LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN
<< "\n");
@@ -238,7 +261,8 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
// Loop::getLatchCmpInst check at the beginning of this function has ensured
// that latch block ends in a conditional branch.
auto *LatchBranch = cast<BranchInst>(LatchBlock->getTerminator());
- assert(LatchBranch->isConditional());
+ assert(LatchBranch->isConditional() &&
+ "expect the loop latch to be ended with a conditional branch");
ICmpInst::Predicate Pred;
if (LatchBranch->getSuccessor(0) == L.getHeader())
Pred = ICmpInst::ICMP_NE;
diff --git a/llvm/test/Transforms/LoopVectorize/evl-iv-simplify.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
similarity index 100%
rename from llvm/test/Transforms/LoopVectorize/evl-iv-simplify.ll
rename to llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
>From eeddd6d162e2332acd7c96b6f1211e83459b1da6 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Tue, 13 May 2025 10:46:59 -0700
Subject: [PATCH 5/5] fixup! Address review comments
---
llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
index 659ea6e6ca0e2..4a1fb095bae35 100644
--- a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
@@ -50,7 +50,7 @@ struct EVLIndVarSimplifyImpl {
OptimizationRemarkEmitter *ORE)
: SE(LAR.SE), ORE(ORE) {}
- // Returns true if modify the loop.
+ /// Returns true if the loop was modified.
bool run(Loop &L);
};
} // anonymous namespace
@@ -188,7 +188,7 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>(
m_Value(RemTC), m_SpecificInt(VF),
/*Scalable=*/m_SpecificInt(1));
- for (auto &PN : BB->phis()) {
+ for (PHINode &PN : BB->phis()) {
if (&PN == IndVar)
continue;
More information about the llvm-commits
mailing list