[llvm] [LV] Introduce the EVLIVSimplify Pass for EVL-vectorized loops (PR #91796)
Min-Yih Hsu via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 23 15:22:00 PST 2024
https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/91796
>From 157f99515618ee12758376d4625172d917281a25 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Fri, 10 May 2024 11:27:40 -0700
Subject: [PATCH 01/11] [LV] Introduce the EVLIVSimplify Pass for
EVL-vectorized loops
When we enable EVL-based loop vectorization w/ predicated tail-folding,
each vectorized loop has effectively two induction variables: one
calculates the step using (VF x vscale) and the other one increases the
IV by values returned from experimental.get.vector.length. The former,
also known as canonical IV, is more favorable for analyses as it's "countable"
in the sense of SCEV; the latter (EVL-based IV), however, is more favorable to
codegen, at least for those that support scalable vectors like AArch64 SVE and
RISC-V.
The idea is that we use the canonical IV all the way until the beginning of the
codegen pipeline, where we replace it with the EVL-based IV using the EVLIVSimplify
Pass introduced here, so that we can have the best of both worlds.
This Pass is enabled by default for RISC-V. However, since we don't
really vectorize loops with predicated tail-folding yet, this Pass is a
no-op at the moment.
That said, I have validated the correctness of this Pass by enabling
EVL-based LV + predicated tail-folding
(i.e. -force-tail-folding-style=data-with-evl
-prefer-predicate-over-epilogue=predicate-dont-vectorize) and running on
SPEC2006INT and SPEC2017 intrate w/ the test workload.
---
llvm/include/llvm/InitializePasses.h | 1 +
.../Transforms/Vectorize/EVLIndVarSimplify.h | 34 +++
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Passes/PassBuilderPipelines.cpp | 1 +
llvm/lib/Passes/PassRegistry.def | 1 +
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 4 +
llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 +
.../Vectorize/EVLIndVarSimplify.cpp | 253 ++++++++++++++++
llvm/test/CodeGen/RISCV/O3-pipeline.ll | 3 +
.../LoopVectorize/RISCV/evl-iv-simplify.ll | 282 ++++++++++++++++++
10 files changed, 581 insertions(+)
create mode 100644 llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
create mode 100644 llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index e50cb0dd7541a2..f10124216d6e8d 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -112,6 +112,7 @@ void initializeExpandReductionsPass(PassRegistry &);
void initializeExpandVariadicsPass(PassRegistry &);
void initializeExpandVectorPredicationPass(PassRegistry &);
void initializeExternalAAWrapperPassPass(PassRegistry &);
+void initializeEVLIndVarSimplifyPass(PassRegistry &);
void initializeFEntryInserterPass(PassRegistry &);
void initializeFinalizeISelPass(PassRegistry &);
void initializeFinalizeMachineBundlesPass(PassRegistry &);
diff --git a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h b/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
new file mode 100644
index 00000000000000..9b1c207439f8a4
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
@@ -0,0 +1,34 @@
+//===-------- EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes a vectorized loop with a canonical IV to use an EVL-based
+// IV if it was tail-folded by predicated EVL.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
+#define LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
+
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class Loop;
+class LPMUpdater;
+class Pass;
+
+/// Turn vectorized loops with canonical induction variables into loops that
+/// only use a single EVL-based induction variable.
+struct EVLIndVarSimplifyPass : public PassInfoMixin<EVLIndVarSimplifyPass> {
+ PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+
+Pass *createEVLIndVarSimplifyPass();
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 60ab33bee704c1..cd6b4d564c941e 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -323,6 +323,7 @@
#include "llvm/Transforms/Utils/SymbolRewriter.h"
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 17710eb94b6ded..c7ca46af1d8f77 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -140,6 +140,7 @@
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 549c1359b5852c..0be563c631f519 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -645,6 +645,7 @@ LOOP_ANALYSIS("should-run-extra-simple-loop-unswitch",
#endif
LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass())
LOOP_PASS("dot-ddg", DDGDotPrinterPass())
+LOOP_PASS("evl-iv-simplify", EVLIndVarSimplifyPass())
LOOP_PASS("guard-widening", GuardWideningPass())
LOOP_PASS("indvars", IndVarSimplifyPass())
LOOP_PASS("invalidate<all>", InvalidateAllAnalysesPass())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 089dc6c529193d..66baaa317fa278 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -39,6 +39,7 @@
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
#include <optional>
using namespace llvm;
@@ -446,6 +447,9 @@ void RISCVPassConfig::addIRPasses() {
}
TargetPassConfig::addIRPasses();
+
+ if (getOptLevel() != CodeGenOptLevel::None)
+ addPass(createEVLIndVarSimplifyPass());
}
bool RISCVPassConfig::addPreISel() {
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index d769d5100afd23..956f3c240ee425 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,4 +1,5 @@
add_llvm_component_library(LLVMVectorize
+ EVLIndVarSimplify.cpp
LoadStoreVectorizer.cpp
LoopIdiomVectorize.cpp
LoopVectorizationLegality.cpp
diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
new file mode 100644
index 00000000000000..21c453925cd76a
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
@@ -0,0 +1,253 @@
+//===------ EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes a vectorized loop with a canonical IV to use an EVL-based
+// IV if it was tail-folded by predicated EVL.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+
+#define DEBUG_TYPE "evl-iv-simplify"
+
+using namespace llvm;
+
+STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated");
+
+namespace {
+struct EVLIndVarSimplifyImpl {
+ ScalarEvolution &SE;
+
+ explicit EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR)
+ : SE(LAR.SE) {}
+
+ explicit EVLIndVarSimplifyImpl(ScalarEvolution &SE) : SE(SE) {}
+
+ // Returns true if the loop was modified.
+ bool run(Loop &L);
+};
+
+struct EVLIndVarSimplify : public LoopPass {
+ static char ID;
+
+ EVLIndVarSimplify() : LoopPass(ID) {
+ initializeEVLIndVarSimplifyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+} // anonymous namespace
+
+static std::optional<uint32_t> getVFFromIndVar(const SCEV *Step,
+ const Function &F) {
+ if (!Step)
+ return std::nullopt;
+
+ // Looking for loops with IV step value in the form of `(<constant VF> x
+ // vscale)`.
+ if (auto *Mul = dyn_cast<SCEVMulExpr>(Step)) {
+ if (Mul->getNumOperands() == 2) {
+ const SCEV *LHS = Mul->getOperand(0);
+ const SCEV *RHS = Mul->getOperand(1);
+ if (auto *Const = dyn_cast<SCEVConstant>(LHS)) {
+ uint64_t V = Const->getAPInt().getLimitedValue();
+ if (isa<SCEVVScale>(RHS) && llvm::isUInt<32>(V))
+ return static_cast<uint32_t>(V);
+ }
+ }
+ }
+
+ // If not, see if the vscale_range of the parent function is a fixed value,
+ // which makes the step value to be replaced by a constant.
+ if (isa<SCEVConstant>(Step) && F.hasFnAttribute(Attribute::VScaleRange)) {
+ APInt V = cast<SCEVConstant>(Step)->getAPInt().abs();
+ ConstantRange CR = llvm::getVScaleRange(&F, 64);
+ if (const APInt *Fixed = CR.getSingleElement()) {
+ V = V.zextOrTrunc(Fixed->getBitWidth());
+ uint64_t VF = V.udiv(*Fixed).getLimitedValue();
+ if (VF && llvm::isUInt<32>(VF))
+ return static_cast<uint32_t>(VF);
+ }
+ }
+
+ return std::nullopt;
+}
+
+// Remove the original induction variable if it's not used anywhere.
+static void cleanupOriginalIndVar(PHINode *OrigIndVar, BasicBlock *InitBlock,
+ BasicBlock *BackEdgeBlock) {
+ Value *InitValue = OrigIndVar->getIncomingValueForBlock(InitBlock);
+ Value *RecValue = OrigIndVar->getIncomingValueForBlock(BackEdgeBlock);
+
+ // If the only user of OrigIndVar is the one that produces RecValue, then we
+ // can safely remove it.
+ if (!OrigIndVar->hasOneUse() || OrigIndVar->user_back() != RecValue)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Removed the original IndVar " << *OrigIndVar << "\n");
+ // Remove OrigIndVar by replacing all its uses by the initial value of this
+ // loop. Then DCE will take care of the rest.
+ OrigIndVar->replaceAllUsesWith(InitValue);
+ OrigIndVar->eraseFromParent();
+}
+
+bool EVLIndVarSimplifyImpl::run(Loop &L) {
+ InductionDescriptor IVD;
+ PHINode *IndVar = L.getInductionVariable(SE);
+ if (!IndVar || !L.getInductionDescriptor(SE, IVD)) {
+ LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName()
+ << "\n");
+ return false;
+ }
+
+ BasicBlock *InitBlock, *BackEdgeBlock;
+ if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) {
+ LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in "
+ << L.getName() << "\n");
+ return false;
+ }
+
+ // Retrieve the loop bounds.
+ std::optional<Loop::LoopBounds> Bounds = L.getBounds(SE);
+ if (!Bounds) {
+ LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName()
+ << "\n");
+ return false;
+ }
+ Value *CanonicalIVInit = &Bounds->getInitialIVValue();
+ Value *CanonicalIVFinal = &Bounds->getFinalIVValue();
+
+ const SCEV *StepV = IVD.getStep();
+ auto VF = getVFFromIndVar(StepV, *L.getHeader()->getParent());
+ if (!VF) {
+ LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV
+ << "'\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Using VF=" << *VF << " for loop " << L.getName()
+ << "\n");
+
+ // Try to find the EVL-based induction variable.
+ using namespace PatternMatch;
+ BasicBlock *BB = IndVar->getParent();
+
+ Value *EVLIndex = nullptr;
+ Value *RemVL = nullptr, *AVL = nullptr;
+ auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>(
+ m_Value(RemVL), m_SpecificInt(*VF),
+ /*Scalable=*/m_SpecificInt(1));
+ for (auto &PN : BB->phis()) {
+ if (&PN == IndVar)
+ continue;
+
+ // Check 1: it has to contain both incoming (init) & backedge blocks
+ // from IndVar.
+ if (PN.getBasicBlockIndex(InitBlock) < 0 ||
+ PN.getBasicBlockIndex(BackEdgeBlock) < 0)
+ continue;
+ // Check 2: EVL index is always increasing, thus its initial value has to be
+ // equal to either the initial IV value (when the canonical IV is also
+ // increasing) or the last IV value (when canonical IV is decreasing).
+ Value *Init = PN.getIncomingValueForBlock(InitBlock);
+ using Direction = Loop::LoopBounds::Direction;
+ switch (Bounds->getDirection()) {
+ case Direction::Increasing:
+ if (Init != CanonicalIVInit)
+ continue;
+ break;
+ case Direction::Decreasing:
+ if (Init != CanonicalIVFinal)
+ continue;
+ break;
+ case Direction::Unknown:
+ // To be more permissive and see if either the initial or final IV value
+ // matches PN's init value.
+ if (Init != CanonicalIVInit && Init != CanonicalIVFinal)
+ continue;
+ break;
+ }
+ Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock);
+ assert(RecValue);
+
+ LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN
+ << "\n");
+
+ // Check 3: Pattern match to find the EVL-based index and total trip count
+ // (AVL).
+ if (match(RecValue,
+ m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) &&
+ match(RemVL, m_Sub(m_Value(AVL), m_Specific(&PN)))) {
+ EVLIndex = RecValue;
+ break;
+ }
+ }
+
+ if (!EVLIndex || !AVL)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Using " << *EVLIndex << " for EVL-based IndVar\n");
+
+ // Create an EVL-based comparison and replace the branch to use it as
+ // predicate.
+ ICmpInst *OrigLatchCmp = L.getLatchCmpInst();
+ ICmpInst::Predicate Pred = OrigLatchCmp->getPredicate();
+ if (!ICmpInst::isEquality(Pred))
+ return false;
+
+ IRBuilder<> Builder(OrigLatchCmp);
+ auto *NewPred = Builder.CreateICmp(Pred, EVLIndex, AVL);
+ OrigLatchCmp->replaceAllUsesWith(NewPred);
+
+ cleanupOriginalIndVar(IndVar, InitBlock, BackEdgeBlock);
+
+ ++NumEliminatedCanonicalIV;
+
+ return true;
+}
+
+PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ if (EVLIndVarSimplifyImpl(AR).run(L))
+ return PreservedAnalyses::allInSet<CFGAnalyses>();
+ return PreservedAnalyses::all();
+}
+
+char EVLIndVarSimplify::ID = 0;
+
+INITIALIZE_PASS_BEGIN(EVLIndVarSimplify, DEBUG_TYPE,
+ "EVL-based Induction Variables Simplify", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(EVLIndVarSimplify, DEBUG_TYPE,
+ "EVL-based Induction Variables Simplify", false, false)
+
+bool EVLIndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ return EVLIndVarSimplifyImpl(SE).run(*L);
+}
+
+Pass *llvm::createEVLIndVarSimplifyPass() { return new EVLIndVarSimplify(); }
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index c29f15a15c1503..d749eff9b20c5b 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -69,6 +69,9 @@
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: TLS Variable Hoist
+; CHECK-NEXT: Scalar Evolution Analysis
+; CHECK-NEXT: Loop Pass Manager
+; CHECK-NEXT: EVL-based Induction Variables Simplify
; CHECK-NEXT: Type Promotion
; CHECK-NEXT: CodeGen Prepare
; CHECK-NEXT: Dominator Tree Construction
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
new file mode 100644
index 00000000000000..5db92fa7255f24
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
@@ -0,0 +1,282 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify)' < %s | FileCheck %s
+; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify),function(simplifycfg,dce)' < %s | FileCheck %s --check-prefix=LOOP-DEL
+
+define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; CHECK-LABEL: define void @simple(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; CHECK-NEXT: [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, [[TMP10]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP25]], [[TMP24]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+; LOOP-DEL-LABEL: define void @simple(
+; LOOP-DEL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
+; LOOP-DEL: vector.ph:
+; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
+; LOOP-DEL: vector.body:
+; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP4]], i32 4, i1 true)
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; LOOP-DEL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP6]]
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; LOOP-DEL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; LOOP-DEL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP11]], ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LOOP-DEL: for.body:
+; LOOP-DEL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; LOOP-DEL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]]
+; LOOP-DEL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; LOOP-DEL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; LOOP-DEL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; LOOP-DEL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; LOOP-DEL: for.cond.cleanup:
+; LOOP-DEL-NEXT: ret void
+;
+entry:
+ %0 = sub i64 -1, %N
+ %1 = call i64 @llvm.vscale.i64()
+ %2 = mul i64 %1, 4
+ %3 = icmp ult i64 %0, %2
+ br i1 %3, label %scalar.ph, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %4 = call i64 @llvm.vscale.i64()
+ %5 = mul i64 %4, 4
+ %6 = call i64 @llvm.vscale.i64()
+ %7 = mul i64 %6, 4
+ %8 = sub i64 %7, 1
+ %n.rnd.up = add i64 %N, %8
+ %n.mod.vf = urem i64 %n.rnd.up, %5
+ %n.vec = sub i64 %n.rnd.up, %n.mod.vf
+ %9 = call i64 @llvm.vscale.i64()
+ %10 = mul i64 %9, 4
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ]
+ %11 = sub i64 %N, %evl.based.iv
+ %12 = call i32 @llvm.experimental.get.vector.length.i64(i64 %11, i32 4, i1 true)
+ %13 = add i64 %evl.based.iv, 0
+ %14 = getelementptr inbounds i32, ptr %b, i64 %13
+ %15 = getelementptr inbounds i32, ptr %14, i32 0
+ %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %15, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
+ %16 = getelementptr inbounds i32, ptr %c, i64 %13
+ %17 = getelementptr inbounds i32, ptr %16, i32 0
+ %vp.op.load1 = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %17, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
+ %18 = add nsw <vscale x 4 x i32> %vp.op.load1, %vp.op.load
+ %19 = getelementptr inbounds i32, ptr %a, i64 %13
+ %20 = getelementptr inbounds i32, ptr %19, i32 0
+ call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %18, ptr align 4 %20, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
+ %21 = zext i32 %12 to i64
+ %index.evl.next = add i64 %21, %evl.based.iv
+ %index.next = add i64 %index, %10
+ %22 = icmp eq i64 %index.next, %n.vec
+ br i1 %22, label %middle.block, label %vector.body, !llvm.loop !0
+
+middle.block: ; preds = %vector.body
+ br i1 true, label %for.cond.cleanup, label %scalar.ph
+
+scalar.ph: ; preds = %entry, %middle.block
+ %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %entry ]
+ br label %for.body
+
+for.body: ; preds = %for.body, %scalar.ph
+ %iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %23 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %24 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %24, %23
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !3
+
+for.cond.cleanup: ; preds = %middle.block, %for.body
+ ret void
+}
+
+; Fixed IV steps resulting from vscale_range with a single element
+
+define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 {
+; CHECK-LABEL: define void @fixed_iv_step(
+; CHECK-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add nsw i64 [[N]], 15
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
+; CHECK-NEXT: [[LSR_IV_NEXT33:%.*]] = add i64 [[N_VEC]], -16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[LSR_IV_NEXT33]], 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; CHECK: for.end.loopexit5:
+; CHECK-NEXT: br label [[FOR_END:%.*]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+; LOOP-DEL-LABEL: define void @fixed_iv_step(
+; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
+; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
+; LOOP-DEL: vector.body:
+; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
+; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; LOOP-DEL: for.end:
+; LOOP-DEL-NEXT: ret void
+;
+entry:
+ br label %vector.ph
+
+vector.ph:
+ %n.rnd.up = add nsw i64 %N, 15
+ %n.vec = and i64 %n.rnd.up, -16
+ %broadcast.splatinsert = insertelement <vscale x 2 x ptr> poison, ptr %arg0, i64 0
+ %broadcast.splat = shufflevector <vscale x 2 x ptr> %broadcast.splatinsert, <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body:
+ %lsr.iv32 = phi i64 [ %lsr.iv.next33, %vector.body ], [ %n.vec, %vector.ph ]
+ %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ]
+ %41 = sub i64 %N, %evl.based.iv
+ %42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true)
+ %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv
+ tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> %broadcast.splat, ptr align 8 %gep, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %42)
+ %43 = zext i32 %42 to i64
+ %index.evl.next = add i64 %evl.based.iv, %43
+ %lsr.iv.next33 = add i64 %lsr.iv32, -16
+ %44 = icmp eq i64 %lsr.iv.next33, 0
+ br i1 %44, label %for.end.loopexit5, label %vector.body, !llvm.loop !3
+
+for.end.loopexit5:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg)
+
+declare <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr nocapture, <vscale x 4 x i1>, i32)
+
+declare void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, <vscale x 4 x i1>, i32)
+
+attributes #0 = { vscale_range(8,8) }
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.isvectorized", i32 1}
+!2 = !{!"llvm.loop.unroll.runtime.disable"}
+!3 = distinct !{!3, !2, !1}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; LOOP-DEL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; LOOP-DEL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; LOOP-DEL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; LOOP-DEL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
>From 855cca90ac730bf5f6163a284603284cd53c7cc2 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 13 May 2024 10:41:45 -0700
Subject: [PATCH 02/11] Address review comments
And simplify the test cases.
---
.../Vectorize/EVLIndVarSimplify.cpp | 19 ++++----
.../LoopVectorize/RISCV/evl-iv-simplify.ll | 46 ++++++-------------
2 files changed, 24 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
index 21c453925cd76a..ba9a707dbea44e 100644
--- a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
@@ -83,16 +83,17 @@ static std::optional<uint32_t> getVFFromIndVar(const SCEV *Step,
// If not, see if the vscale_range of the parent function is a fixed value,
// which makes the step value to be replaced by a constant.
- if (isa<SCEVConstant>(Step) && F.hasFnAttribute(Attribute::VScaleRange)) {
- APInt V = cast<SCEVConstant>(Step)->getAPInt().abs();
- ConstantRange CR = llvm::getVScaleRange(&F, 64);
- if (const APInt *Fixed = CR.getSingleElement()) {
- V = V.zextOrTrunc(Fixed->getBitWidth());
- uint64_t VF = V.udiv(*Fixed).getLimitedValue();
- if (VF && llvm::isUInt<32>(VF))
- return static_cast<uint32_t>(VF);
+ if (F.hasFnAttribute(Attribute::VScaleRange))
+ if (auto *ConstStep = dyn_cast<SCEVConstant>(Step)) {
+ APInt V = ConstStep->getAPInt().abs();
+ ConstantRange CR = llvm::getVScaleRange(&F, 64);
+ if (const APInt *Fixed = CR.getSingleElement()) {
+ V = V.zextOrTrunc(Fixed->getBitWidth());
+ uint64_t VF = V.udiv(*Fixed).getLimitedValue();
+ if (VF && llvm::isUInt<32>(VF))
+ return static_cast<uint32_t>(VF);
+ }
}
- }
return std::nullopt;
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
index 5db92fa7255f24..72cb30f270f249 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
@@ -2,9 +2,9 @@
; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify)' < %s | FileCheck %s
; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify),function(simplifycfg,dce)' < %s | FileCheck %s --check-prefix=LOOP-DEL
-define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %N) {
; CHECK-LABEL: define void @simple(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
@@ -29,12 +29,9 @@ define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; CHECK-NEXT: [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; CHECK-NEXT: [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[C]], [[VP_OP_LOAD1]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
@@ -52,10 +49,7 @@ define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP25]], [[TMP24]]
+; CHECK-NEXT: [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -67,7 +61,7 @@ define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; CHECK-NEXT: ret void
;
; LOOP-DEL-LABEL: define void @simple(
-; LOOP-DEL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LOOP-DEL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; LOOP-DEL-NEXT: entry:
; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
; LOOP-DEL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
@@ -82,12 +76,9 @@ define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; LOOP-DEL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP4]], i32 4, i1 true)
; LOOP-DEL-NEXT: [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]]
-; LOOP-DEL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; LOOP-DEL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
-; LOOP-DEL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP6]]
-; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
; LOOP-DEL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
-; LOOP-DEL-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[C]], [[VP_OP_LOAD1]]
; LOOP-DEL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
; LOOP-DEL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP11]], ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
@@ -98,10 +89,7 @@ define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; LOOP-DEL: for.body:
; LOOP-DEL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; LOOP-DEL-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; LOOP-DEL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; LOOP-DEL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]]
+; LOOP-DEL-NEXT: [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; LOOP-DEL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; LOOP-DEL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; LOOP-DEL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -138,14 +126,11 @@ vector.body: ; preds = %vector.body, %vecto
%13 = add i64 %evl.based.iv, 0
%14 = getelementptr inbounds i32, ptr %b, i64 %13
%15 = getelementptr inbounds i32, ptr %14, i32 0
- %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %15, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
- %16 = getelementptr inbounds i32, ptr %c, i64 %13
- %17 = getelementptr inbounds i32, ptr %16, i32 0
- %vp.op.load1 = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %17, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
- %18 = add nsw <vscale x 4 x i32> %vp.op.load1, %vp.op.load
+ %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %15, <vscale x 4 x i1> splat (i1 true), i32 %12)
+ %18 = add nsw <vscale x 4 x i32> %c, %vp.op.load
%19 = getelementptr inbounds i32, ptr %a, i64 %13
%20 = getelementptr inbounds i32, ptr %19, i32 0
- call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %18, ptr align 4 %20, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
+ call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %18, ptr align 4 %20, <vscale x 4 x i1> splat (i1 true), i32 %12)
%21 = zext i32 %12 to i64
%index.evl.next = add i64 %21, %evl.based.iv
%index.next = add i64 %index, %10
@@ -163,11 +148,8 @@ for.body: ; preds = %for.body, %scalar.p
%iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
%23 = load i32, ptr %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
- %24 = load i32, ptr %arrayidx2, align 4
- %add = add nsw i32 %24, %23
%arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
- store i32 %add, ptr %arrayidx4, align 4
+ store i32 %23, ptr %arrayidx4, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %N
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !3
@@ -241,7 +223,7 @@ vector.body:
%41 = sub i64 %N, %evl.based.iv
%42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true)
%gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv
- tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> %broadcast.splat, ptr align 8 %gep, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %42)
+ tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> %broadcast.splat, ptr align 8 %gep, <vscale x 2 x i1> splat (i1 true), i32 %42)
%43 = zext i32 %42 to i64
%index.evl.next = add i64 %evl.based.iv, %43
%lsr.iv.next33 = add i64 %lsr.iv32, -16
>From 6bc5c94dfbb6e54baf098fae0a123be2e7f966ed Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 16 May 2024 11:09:46 -0700
Subject: [PATCH 03/11] Move EVLIndVarSimplify to CodeGen
---
.../{Transforms/Vectorize => CodeGen}/EVLIndVarSimplify.h | 6 +++---
llvm/lib/CodeGen/CMakeLists.txt | 1 +
.../{Transforms/Vectorize => CodeGen}/EVLIndVarSimplify.cpp | 2 +-
llvm/lib/Passes/PassBuilder.cpp | 2 +-
llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +-
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 +-
llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 -
.../LoopVectorize => CodeGen}/RISCV/evl-iv-simplify.ll | 0
8 files changed, 8 insertions(+), 8 deletions(-)
rename llvm/include/llvm/{Transforms/Vectorize => CodeGen}/EVLIndVarSimplify.h (84%)
rename llvm/lib/{Transforms/Vectorize => CodeGen}/EVLIndVarSimplify.cpp (99%)
rename llvm/test/{Transforms/LoopVectorize => CodeGen}/RISCV/evl-iv-simplify.ll (100%)
diff --git a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h b/llvm/include/llvm/CodeGen/EVLIndVarSimplify.h
similarity index 84%
rename from llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
rename to llvm/include/llvm/CodeGen/EVLIndVarSimplify.h
index 9b1c207439f8a4..88549d443b8e66 100644
--- a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
+++ b/llvm/include/llvm/CodeGen/EVLIndVarSimplify.h
@@ -1,4 +1,4 @@
-//===-------- EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV----===//
+//===- EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV-*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
-#define LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
+#ifndef LLVM_CODEGEN_EVLINDVARSIMPLIFY_H
+#define LLVM_CODEGEN_EVLINDVARSIMPLIFY_H
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/PassManager.h"
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 5a17944db0ae03..6e95693449ee32 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_component_library(LLVMCodeGen
EarlyIfConversion.cpp
EdgeBundles.cpp
EHContGuardCatchret.cpp
+ EVLIndVarSimplify.cpp
ExecutionDomainFix.cpp
ExpandLargeDivRem.cpp
ExpandLargeFpConvert.cpp
diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
similarity index 99%
rename from llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
rename to llvm/lib/CodeGen/EVLIndVarSimplify.cpp
index ba9a707dbea44e..b7a79be998d12b 100644
--- a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
+++ b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
+#include "llvm/CodeGen/EVLIndVarSimplify.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index cd6b4d564c941e..899e288acb2b9d 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -84,6 +84,7 @@
#include "llvm/CodeGen/DeadMachineInstructionElim.h"
#include "llvm/CodeGen/DwarfEHPrepare.h"
#include "llvm/CodeGen/EarlyIfConversion.h"
+#include "llvm/CodeGen/EVLIndVarSimplify.h"
#include "llvm/CodeGen/ExpandLargeDivRem.h"
#include "llvm/CodeGen/ExpandLargeFpConvert.h"
#include "llvm/CodeGen/ExpandMemCmp.h"
@@ -323,7 +324,6 @@
#include "llvm/Transforms/Utils/SymbolRewriter.h"
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
-#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index c7ca46af1d8f77..d6412bf20cec8f 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/CodeGen/EVLIndVarSimplify.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Passes/OptimizationLevel.h"
@@ -140,7 +141,6 @@
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
-#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 66baaa317fa278..2be7d675a5508d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -19,6 +19,7 @@
#include "TargetInfo/RISCVTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/EVLIndVarSimplify.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
@@ -39,7 +40,6 @@
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
#include <optional>
using namespace llvm;
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 956f3c240ee425..d769d5100afd23 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,5 +1,4 @@
add_llvm_component_library(LLVMVectorize
- EVLIndVarSimplify.cpp
LoadStoreVectorizer.cpp
LoopIdiomVectorize.cpp
LoopVectorizationLegality.cpp
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll b/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
similarity index 100%
rename from llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
rename to llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
>From 14f4a7ec4972e49b041ae2b25eae945d91382d40 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 20 May 2024 13:56:42 -0700
Subject: [PATCH 04/11] Address review comments
---
llvm/lib/CodeGen/EVLIndVarSimplify.cpp | 52 ++++++++++++++++----------
1 file changed, 32 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
index b7a79be998d12b..af685f65ff3a4c 100644
--- a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
+++ b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -33,6 +34,11 @@ using namespace llvm;
STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated");
+static cl::opt<bool> EnableEVLIndVarSimplify(
+ "enable-evl-indvar-simplify",
+ cl::desc("Enable EVL-based induction variable simplify Pass"), cl::Hidden,
+ cl::init(true));
+
namespace {
struct EVLIndVarSimplifyImpl {
ScalarEvolution &SE;
@@ -62,10 +68,9 @@ struct EVLIndVarSimplify : public LoopPass {
};
} // anonymous namespace
-static std::optional<uint32_t> getVFFromIndVar(const SCEV *Step,
- const Function &F) {
+static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
if (!Step)
- return std::nullopt;
+ return 0U;
// Looking for loops with IV step value in the form of `(<constant VF> x
// vscale)`.
@@ -95,14 +100,18 @@ static std::optional<uint32_t> getVFFromIndVar(const SCEV *Step,
}
}
- return std::nullopt;
+ return 0U;
}
// Remove the original induction variable if it's not used anywhere.
-static void cleanupOriginalIndVar(PHINode *OrigIndVar, BasicBlock *InitBlock,
- BasicBlock *BackEdgeBlock) {
- Value *InitValue = OrigIndVar->getIncomingValueForBlock(InitBlock);
- Value *RecValue = OrigIndVar->getIncomingValueForBlock(BackEdgeBlock);
+static void tryCleanupOriginalIndVar(PHINode *OrigIndVar,
+ const InductionDescriptor &IVD) {
+ if (OrigIndVar->getNumIncomingValues() != 2)
+ return;
+ Value *InitValue = OrigIndVar->getIncomingValue(0);
+ Value *RecValue = OrigIndVar->getIncomingValue(1);
+ if (InitValue != IVD.getStartValue())
+ std::swap(InitValue, RecValue);
// If the only user of OrigIndVar is the one produces RecValue, then we can
// safely remove it.
@@ -117,6 +126,9 @@ static void cleanupOriginalIndVar(PHINode *OrigIndVar, BasicBlock *InitBlock,
}
bool EVLIndVarSimplifyImpl::run(Loop &L) {
+ if (!EnableEVLIndVarSimplify)
+ return false;
+
InductionDescriptor IVD;
PHINode *IndVar = L.getInductionVariable(SE);
if (!IndVar || !L.getInductionDescriptor(SE, IVD)) {
@@ -143,23 +155,23 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
Value *CanonicalIVFinal = &Bounds->getFinalIVValue();
const SCEV *StepV = IVD.getStep();
- auto VF = getVFFromIndVar(StepV, *L.getHeader()->getParent());
+ uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent());
if (!VF) {
LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV
<< "'\n");
return false;
}
- LLVM_DEBUG(dbgs() << "Using VF=" << *VF << " for loop " << L.getName()
+ LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName()
<< "\n");
// Try to find the EVL-based induction variable.
using namespace PatternMatch;
BasicBlock *BB = IndVar->getParent();
- Value *EVLIndex = nullptr;
- Value *RemVL = nullptr, *AVL = nullptr;
+ Value *EVLIndVar = nullptr;
+ Value *RemTC = nullptr, *TC = nullptr;
auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>(
- m_Value(RemVL), m_SpecificInt(*VF),
+ m_Value(RemTC), m_SpecificInt(VF),
/*Scalable=*/m_SpecificInt(1));
for (auto &PN : BB->phis()) {
if (&PN == IndVar)
@@ -198,19 +210,19 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
<< "\n");
// Check 3: Pattern match to find the EVL-based index and total trip count
- // (AVL).
+ // (TC).
if (match(RecValue,
m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) &&
- match(RemVL, m_Sub(m_Value(AVL), m_Specific(&PN)))) {
- EVLIndex = RecValue;
+ match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) {
+ EVLIndVar = RecValue;
break;
}
}
- if (!EVLIndex || !AVL)
+ if (!EVLIndVar || !TC)
return false;
- LLVM_DEBUG(dbgs() << "Using " << *EVLIndex << " for EVL-based IndVar\n");
+ LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n");
// Create an EVL-based comparison and replace the branch to use it as
// predicate.
@@ -220,10 +232,10 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
return false;
IRBuilder<> Builder(OrigLatchCmp);
- auto *NewPred = Builder.CreateICmp(Pred, EVLIndex, AVL);
+ auto *NewPred = Builder.CreateICmp(Pred, EVLIndVar, TC);
OrigLatchCmp->replaceAllUsesWith(NewPred);
- cleanupOriginalIndVar(IndVar, InitBlock, BackEdgeBlock);
+ tryCleanupOriginalIndVar(IndVar, IVD);
++NumEliminatedCanonicalIV;
>From 3f6b6605e3ade6f4d7d1ba3f741c3fd562720201 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Wed, 22 May 2024 12:21:37 -0700
Subject: [PATCH 05/11] Address review comments
And check if the trip count matches the canonical IV.
---
llvm/lib/CodeGen/EVLIndVarSimplify.cpp | 40 +++++++++--
llvm/test/CodeGen/RISCV/evl-iv-simplify.ll | 78 ++++++++++++++++++++++
2 files changed, 113 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
index af685f65ff3a4c..8d2e29761d1d9c 100644
--- a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
+++ b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
@@ -95,7 +95,9 @@ static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
if (const APInt *Fixed = CR.getSingleElement()) {
V = V.zextOrTrunc(Fixed->getBitWidth());
uint64_t VF = V.udiv(*Fixed).getLimitedValue();
- if (VF && llvm::isUInt<32>(VF))
+ if (VF && llvm::isUInt<32>(VF) &&
+ // Make sure step is dividable by vscale.
+ V.urem(*Fixed).isZero())
return static_cast<uint32_t>(VF);
}
}
@@ -113,14 +115,14 @@ static void tryCleanupOriginalIndVar(PHINode *OrigIndVar,
if (InitValue != IVD.getStartValue())
std::swap(InitValue, RecValue);
- // If the only user of OrigIndVar is the one produces RecValue, then we can
- // safely remove it.
+ // If the only user of OrigIndVar is the one that produces RecValue, then we
+ // can safely remove it.
if (!OrigIndVar->hasOneUse() || OrigIndVar->user_back() != RecValue)
return;
LLVM_DEBUG(dbgs() << "Removed the original IndVar " << *OrigIndVar << "\n");
- // Remove OrigIndVar by replacing all its uses by the initial value of this
- // loop. Then DCE will take care of the rest.
+ // Turn OrigIndVar into dead code by replacing all its uses by the initial
+ // value of this loop.
OrigIndVar->replaceAllUsesWith(InitValue);
OrigIndVar->eraseFromParent();
}
@@ -153,6 +155,8 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
}
Value *CanonicalIVInit = &Bounds->getInitialIVValue();
Value *CanonicalIVFinal = &Bounds->getFinalIVValue();
+ const SCEV *CanonicalIVInitV = SE.getSCEV(CanonicalIVInit);
+ const SCEV *CanonicalIVFinalV = SE.getSCEV(CanonicalIVFinal);
const SCEV *StepV = IVD.getStep();
uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent());
@@ -222,6 +226,29 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
if (!EVLIndVar || !TC)
return false;
+ // Make sure TC is related to the original trip count of the canonical IV.
+ // Specifically, if the canonical trip count is derived from TC.
+ const SCEV *TCV = SE.getSCEV(TC);
+ bool MatchTC = false;
+ if (const auto *ConstTCV = dyn_cast<SCEVConstant>(TCV)) {
+ // If TC is a constant and vscale is also a constant, then the canonical
+ // trip count will be constant. Canonical trip count * Step equals to the
+ // round up of TC.
+ if (const auto *ConstStep = dyn_cast<SCEVConstant>(StepV))
+ if (unsigned CanonicalTC = SE.getSmallConstantTripCount(&L)) {
+ APInt Step = ConstStep->getAPInt().abs().zextOrTrunc(64);
+ APInt CanonicalTripCount(64, CanonicalTC);
+ APInt TripCount = ConstTCV->getAPInt().zextOrTrunc(64);
+ MatchTC = (CanonicalTripCount * Step - TripCount).ult(Step);
+ }
+ }
+ // Otherwise, we simply check if the upper or lower bound expression of the
+ // canonical IV contains TC.
+ auto equalsTC = [&](const SCEV *S) -> bool { return S == TCV; };
+ if (!MatchTC && !llvm::SCEVExprContains(CanonicalIVFinalV, equalsTC) &&
+ !llvm::SCEVExprContains(CanonicalIVInitV, equalsTC))
+ return false;
+
LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n");
// Create an EVL-based comparison and replace the branch to use it as
@@ -259,6 +286,9 @@ INITIALIZE_PASS_END(EVLIndVarSimplify, DEBUG_TYPE,
"EVL-based Induction Variables Simplify", false, false)
bool EVLIndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
return EVLIndVarSimplifyImpl(SE).run(*L);
}
diff --git a/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll b/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
index 72cb30f270f249..0e319b0be9684b 100644
--- a/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
+++ b/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
@@ -237,6 +237,84 @@ for.end:
ret void
}
+; Fixed IV step and trip count
+define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 {
+; CHECK-LABEL: define void @fixed_iv_step_tc(
+; CHECK-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add nsw i64 87, 15
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
+; CHECK-NEXT: [[LSR_IV_NEXT33:%.*]] = add i64 [[N_VEC]], -16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[LSR_IV_NEXT33]], 0
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; CHECK: for.end.loopexit5:
+; CHECK-NEXT: br label [[FOR_END:%.*]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+; LOOP-DEL-LABEL: define void @fixed_iv_step_tc(
+; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]]) #[[ATTR1]] {
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
+; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
+; LOOP-DEL: vector.body:
+; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
+; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; LOOP-DEL: for.end:
+; LOOP-DEL-NEXT: ret void
+;
+entry:
+ br label %vector.ph
+
+vector.ph:
+ %n.rnd.up = add nsw i64 87, 15
+ %n.vec = and i64 %n.rnd.up, -16
+ %broadcast.splatinsert = insertelement <vscale x 2 x ptr> poison, ptr %arg0, i64 0
+ %broadcast.splat = shufflevector <vscale x 2 x ptr> %broadcast.splatinsert, <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body:
+ %lsr.iv32 = phi i64 [ %lsr.iv.next33, %vector.body ], [ %n.vec, %vector.ph ]
+ %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ]
+ %41 = sub i64 87, %evl.based.iv
+ %42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true)
+ %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv
+ tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> %broadcast.splat, ptr align 8 %gep, <vscale x 2 x i1> splat (i1 true), i32 %42)
+ %43 = zext i32 %42 to i64
+ %index.evl.next = add i64 %evl.based.iv, %43
+ %lsr.iv.next33 = add i64 %lsr.iv32, -16
+ %44 = icmp eq i64 %lsr.iv.next33, 0
+ br i1 %44, label %for.end.loopexit5, label %vector.body, !llvm.loop !3
+
+for.end.loopexit5:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
declare i64 @llvm.vscale.i64()
declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg)
>From 9f8bbd76b1605d85332785ab937305e619650e97 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Wed, 22 May 2024 12:32:06 -0700
Subject: [PATCH 06/11] fixup! Address review comments
---
llvm/lib/CodeGen/EVLIndVarSimplify.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
index 8d2e29761d1d9c..da9602c31285dc 100644
--- a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
+++ b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
@@ -96,7 +96,7 @@ static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
V = V.zextOrTrunc(Fixed->getBitWidth());
uint64_t VF = V.udiv(*Fixed).getLimitedValue();
if (VF && llvm::isUInt<32>(VF) &&
- // Make sure step is dividable by vscale.
+ // Make sure step is divisible by vscale.
V.urem(*Fixed).isZero())
return static_cast<uint32_t>(VF);
}
>From c833e9ec8029d386f7b82ea55594e2f43db8c763 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Tue, 28 May 2024 11:32:14 -0700
Subject: [PATCH 07/11] Use RecursivelyDeleteDeadPHINode instead of cleaning
IndVar manually
---
llvm/lib/CodeGen/EVLIndVarSimplify.cpp | 32 ++++++----------------
llvm/test/CodeGen/RISCV/evl-iv-simplify.ll | 12 --------
2 files changed, 9 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
index da9602c31285dc..e97fc578813ae5 100644
--- a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
+++ b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
@@ -27,6 +27,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/Local.h"
#define DEBUG_TYPE "evl-iv-simplify"
@@ -105,28 +106,6 @@ static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
return 0U;
}
-// Remove the original induction variable if it's not used anywhere.
-static void tryCleanupOriginalIndVar(PHINode *OrigIndVar,
- const InductionDescriptor &IVD) {
- if (OrigIndVar->getNumIncomingValues() != 2)
- return;
- Value *InitValue = OrigIndVar->getIncomingValue(0);
- Value *RecValue = OrigIndVar->getIncomingValue(1);
- if (InitValue != IVD.getStartValue())
- std::swap(InitValue, RecValue);
-
- // If the only user of OrigIndVar is the one that produces RecValue, then we
- // can safely remove it.
- if (!OrigIndVar->hasOneUse() || OrigIndVar->user_back() != RecValue)
- return;
-
- LLVM_DEBUG(dbgs() << "Removed the original IndVar " << *OrigIndVar << "\n");
- // Turn OrigIndVar into dead code by replacing all its uses by the initial
- // value of this loop.
- OrigIndVar->replaceAllUsesWith(InitValue);
- OrigIndVar->eraseFromParent();
-}
-
bool EVLIndVarSimplifyImpl::run(Loop &L) {
if (!EnableEVLIndVarSimplify)
return false;
@@ -262,7 +241,14 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
auto *NewPred = Builder.CreateICmp(Pred, EVLIndVar, TC);
OrigLatchCmp->replaceAllUsesWith(NewPred);
- tryCleanupOriginalIndVar(IndVar, IVD);
+ // llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are
+ // not used outside the cycles. However, in this case the now-RAUW-ed
+ // OrigLatchCmp will be considered a use outside the cycle while in reality it's
+ // practically dead. Thus we need to remove it before calling
+ // RecursivelyDeleteDeadPHINode.
+ (void)RecursivelyDeleteTriviallyDeadInstructions(OrigLatchCmp);
+ if (llvm::RecursivelyDeleteDeadPHINode(IndVar))
+ LLVM_DEBUG(dbgs() << "Removed original IndVar\n");
++NumEliminatedCanonicalIV;
diff --git a/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll b/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
index 0e319b0be9684b..a47f2be1f4c62c 100644
--- a/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
+++ b/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
@@ -20,8 +20,6 @@ define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -37,9 +35,7 @@ define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %
; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
-; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, [[TMP10]]
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -166,8 +162,6 @@ define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add nsw i64 [[N]], 15
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -179,9 +173,7 @@ define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 {
; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
-; CHECK-NEXT: [[LSR_IV_NEXT33:%.*]] = add i64 [[N_VEC]], -16
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[LSR_IV_NEXT33]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
; CHECK: for.end.loopexit5:
; CHECK-NEXT: br label [[FOR_END:%.*]]
@@ -244,8 +236,6 @@ define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add nsw i64 87, 15
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -257,9 +247,7 @@ define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 {
; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
-; CHECK-NEXT: [[LSR_IV_NEXT33:%.*]] = add i64 [[N_VEC]], -16
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[LSR_IV_NEXT33]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
; CHECK: for.end.loopexit5:
; CHECK-NEXT: br label [[FOR_END:%.*]]
>From 8259007eaae86d8c8ec8258ff6a4483874ddeb49 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 15 Jul 2024 13:49:46 -0700
Subject: [PATCH 08/11] Use a better exit condition that is derived from the
original loop
The new exit condition will now be `EVLIV > VF * BTC` where VF is the
vectorization factor and BTC being the backedge taken count.
---
llvm/lib/CodeGen/EVLIndVarSimplify.cpp | 53 +++++++---------
llvm/test/CodeGen/RISCV/evl-iv-simplify.ll | 70 +++++++++++++++++-----
2 files changed, 77 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
index e97fc578813ae5..0070675dc6e948 100644
--- a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
+++ b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
@@ -28,6 +29,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#define DEBUG_TYPE "evl-iv-simplify"
@@ -134,8 +136,6 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
}
Value *CanonicalIVInit = &Bounds->getInitialIVValue();
Value *CanonicalIVFinal = &Bounds->getFinalIVValue();
- const SCEV *CanonicalIVInitV = SE.getSCEV(CanonicalIVInit);
- const SCEV *CanonicalIVFinalV = SE.getSCEV(CanonicalIVFinal);
const SCEV *StepV = IVD.getStep();
uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent());
@@ -152,7 +152,7 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
BasicBlock *BB = IndVar->getParent();
Value *EVLIndVar = nullptr;
- Value *RemTC = nullptr, *TC = nullptr;
+ Value *RemTC = nullptr;
auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>(
m_Value(RemTC), m_SpecificInt(VF),
/*Scalable=*/m_SpecificInt(1));
@@ -192,53 +192,42 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN
<< "\n");
- // Check 3: Pattern match to find the EVL-based index and total trip count
- // (TC).
+ // Check 3: Pattern match to find the EVL-based index.
if (match(RecValue,
m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) &&
- match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) {
+ match(RemTC, m_Sub(m_Value(), m_Specific(&PN)))) {
EVLIndVar = RecValue;
break;
}
}
- if (!EVLIndVar || !TC)
+ if (!EVLIndVar)
return false;
- // Make sure TC is related to the original trip count of the canonical IV.
- // Specifically, if the canonical trip count is derived from TC.
- const SCEV *TCV = SE.getSCEV(TC);
- bool MatchTC = false;
- if (const auto *ConstTCV = dyn_cast<SCEVConstant>(TCV)) {
- // If TC is a constant and vscale is also a constant, then the canonical
- // trip count will be constant. Canonical trip count * Step equals to the
- // round up of TC.
- if (const auto *ConstStep = dyn_cast<SCEVConstant>(StepV))
- if (unsigned CanonicalTC = SE.getSmallConstantTripCount(&L)) {
- APInt Step = ConstStep->getAPInt().abs().zextOrTrunc(64);
- APInt CanonicalTripCount(64, CanonicalTC);
- APInt TripCount = ConstTCV->getAPInt().zextOrTrunc(64);
- MatchTC = (CanonicalTripCount * Step - TripCount).ult(Step);
- }
- }
- // Otherwise, we simply check if the upper or lower bound expression of the
- // canonical IV contains TC.
- auto equalsTC = [&](const SCEV *S) -> bool { return S == TCV; };
- if (!MatchTC && !llvm::SCEVExprContains(CanonicalIVFinalV, equalsTC) &&
- !llvm::SCEVExprContains(CanonicalIVInitV, equalsTC))
+ const SCEV *BTC = SE.getBackedgeTakenCount(&L);
+ LLVM_DEBUG(dbgs() << "BTC: " << *BTC << "\n");
+ if (isa<SCEVCouldNotCompute>(BTC))
return false;
- LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n");
+ const SCEV *VFV = SE.getConstant(BTC->getType(), VF);
+ VFV = SE.getMulExpr(VFV, SE.getVScale(VFV->getType()));
+ const SCEV *ExitValV = SE.getMulExpr(BTC, VFV);
+ LLVM_DEBUG(dbgs() << "ExitVal: " << *ExitValV << "\n");
// Create an EVL-based comparison and replace the branch to use it as
// predicate.
ICmpInst *OrigLatchCmp = L.getLatchCmpInst();
- ICmpInst::Predicate Pred = OrigLatchCmp->getPredicate();
- if (!ICmpInst::isEquality(Pred))
+ const DataLayout &DL = L.getHeader()->getDataLayout();
+ SCEVExpander Expander(SE, DL, "evl.iv.exitcondition");
+ if (!Expander.isSafeToExpandAt(ExitValV, OrigLatchCmp))
return false;
+ LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n");
+
+ Value *ExitVal =
+ Expander.expandCodeFor(ExitValV, EVLIndVar->getType(), OrigLatchCmp);
IRBuilder<> Builder(OrigLatchCmp);
- auto *NewPred = Builder.CreateICmp(Pred, EVLIndVar, TC);
+ auto *NewPred = Builder.CreateICmp(ICmpInst::ICMP_UGT, EVLIndVar, ExitVal);
OrigLatchCmp->replaceAllUsesWith(NewPred);
// llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are
diff --git a/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll b/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
index a47f2be1f4c62c..85529cda965acd 100644
--- a/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
+++ b/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
@@ -2,7 +2,7 @@
; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify)' < %s | FileCheck %s
; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify),function(simplifycfg,dce)' < %s | FileCheck %s --check-prefix=LOOP-DEL
-define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %N) {
+define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %N) vscale_range(2, 1024) {
; CHECK-LABEL: define void @simple(
; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
@@ -20,6 +20,16 @@ define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP22:%.*]] = udiv i64 [[N_RND_UP]], [[TMP10]]
+; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i64 [[TMP22]], 2
+; CHECK-NEXT: [[TMP24:%.*]] = sub i64 4, [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], [[TMP9]]
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 0, [[TMP25]]
+; CHECK-NEXT: [[TMP16:%.*]] = udiv i64 [[TMP15]], [[TMP10]]
+; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP16]], [[TMP9]]
+; CHECK-NEXT: [[TMP27:%.*]] = shl i64 [[TMP26]], 2
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -35,8 +45,8 @@ define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %
; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP28:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP27]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -65,6 +75,20 @@ define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %
; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
; LOOP-DEL: vector.ph:
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = sub i64 [[TMP18]], 1
+; LOOP-DEL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP19]]
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP20]], 4
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = udiv i64 [[N_RND_UP]], [[TMP8]]
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = shl nuw nsw i64 [[TMP9]], 2
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = sub i64 4, [[TMP21]]
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], [[TMP20]]
+; LOOP-DEL-NEXT: [[TMP24:%.*]] = sub i64 0, [[TMP23]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], [[TMP8]]
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP25]], [[TMP20]]
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 2
; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
; LOOP-DEL: vector.body:
; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -80,8 +104,8 @@ define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %
; LOOP-DEL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP11]], ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
; LOOP-DEL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
-; LOOP-DEL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; LOOP-DEL-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP16]]
+; LOOP-DEL-NEXT: br i1 [[TMP26]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LOOP-DEL: for.body:
; LOOP-DEL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
@@ -129,7 +153,7 @@ vector.body: ; preds = %vector.body, %vecto
call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %18, ptr align 4 %20, <vscale x 4 x i1> splat (i1 true), i32 %12)
%21 = zext i32 %12 to i64
%index.evl.next = add i64 %21, %evl.based.iv
- %index.next = add i64 %index, %10
+ %index.next = add nuw i64 %index, %10
%22 = icmp eq i64 %index.next, %n.vec
br i1 %22, label %middle.block, label %vector.body, !llvm.loop !0
@@ -162,8 +186,15 @@ define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[N_VEC]], -16
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -173,8 +204,8 @@ define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 {
; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP4]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
; CHECK: for.end.loopexit5:
; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: for.end:
@@ -183,8 +214,15 @@ define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 {
; LOOP-DEL-LABEL: define void @fixed_iv_step(
; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; LOOP-DEL-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = add i64 [[N_VEC]], -16
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 4
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP6]], [[TMP7]]
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 1
; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
; LOOP-DEL: vector.body:
; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -194,8 +232,8 @@ define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 {
; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
-; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP4]]
+; LOOP-DEL-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
; LOOP-DEL: for.end:
; LOOP-DEL-NEXT: ret void
;
@@ -238,6 +276,8 @@ define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 {
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw i64 [[TMP3]], 10
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -247,8 +287,8 @@ define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 {
; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87
-; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP4]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
; CHECK: for.end.loopexit5:
; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: for.end:
@@ -259,6 +299,8 @@ define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 {
; LOOP-DEL-NEXT: entry:
; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = mul nuw nsw i64 [[TMP3]], 10
; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
; LOOP-DEL: vector.body:
; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -268,8 +310,8 @@ define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 {
; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
-; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87
-; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP4]]
+; LOOP-DEL-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
; LOOP-DEL: for.end:
; LOOP-DEL-NEXT: ret void
;
>From 8f04feb4e5e13b4bc77f07bd911288f2fe025a7a Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 11 Nov 2024 14:49:45 -0800
Subject: [PATCH 09/11] (Stash) Put EVLIVSimplify Pass at the end of the
vectorizer Pass
In order to simplify the exit condition.
---
llvm/lib/CodeGen/EVLIndVarSimplify.cpp | 44 ++++++++++----------
llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 4 --
3 files changed, 24 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
index 0070675dc6e948..2004b86c1284c3 100644
--- a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
+++ b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
@@ -112,6 +112,11 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
if (!EnableEVLIndVarSimplify)
return false;
+ BasicBlock *LatchBlock = L.getLoopLatch();
+ ICmpInst *OrigLatchCmp = L.getLatchCmpInst();
+ if (!LatchBlock || !OrigLatchCmp)
+ return false;
+
InductionDescriptor IVD;
PHINode *IndVar = L.getInductionVariable(SE);
if (!IndVar || !L.getInductionDescriptor(SE, IVD)) {
@@ -153,6 +158,7 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
Value *EVLIndVar = nullptr;
Value *RemTC = nullptr;
+ Value *TC = nullptr;
auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>(
m_Value(RemTC), m_SpecificInt(VF),
/*Scalable=*/m_SpecificInt(1));
@@ -192,43 +198,37 @@ bool EVLIndVarSimplifyImpl::run(Loop &L) {
LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN
<< "\n");
- // Check 3: Pattern match to find the EVL-based index.
+ // Check 3: Pattern match to find the EVL-based index and total trip count
+ // (TC).
if (match(RecValue,
m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) &&
- match(RemTC, m_Sub(m_Value(), m_Specific(&PN)))) {
+ match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) {
EVLIndVar = RecValue;
break;
}
}
- if (!EVLIndVar)
- return false;
-
- const SCEV *BTC = SE.getBackedgeTakenCount(&L);
- LLVM_DEBUG(dbgs() << "BTC: " << *BTC << "\n");
- if (isa<SCEVCouldNotCompute>(BTC))
+ if (!EVLIndVar || !TC)
return false;
- const SCEV *VFV = SE.getConstant(BTC->getType(), VF);
- VFV = SE.getMulExpr(VFV, SE.getVScale(VFV->getType()));
- const SCEV *ExitValV = SE.getMulExpr(BTC, VFV);
- LLVM_DEBUG(dbgs() << "ExitVal: " << *ExitValV << "\n");
+ LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n");
// Create an EVL-based comparison and replace the branch to use it as
// predicate.
- ICmpInst *OrigLatchCmp = L.getLatchCmpInst();
- const DataLayout &DL = L.getHeader()->getDataLayout();
- SCEVExpander Expander(SE, DL, "evl.iv.exitcondition");
- if (!Expander.isSafeToExpandAt(ExitValV, OrigLatchCmp))
- return false;
- LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n");
+ // Loop::getLatchCmpInst check at the beginning of this function has ensured
+ // that latch block ends in a conditional branch.
+ auto *LatchBranch = cast<BranchInst>(LatchBlock->getTerminator());
+ assert(LatchBranch->getNumSuccessors() == 2);
+ ICmpInst::Predicate Pred;
+ if (LatchBranch->getSuccessor(0) == L.getHeader())
+ Pred = ICmpInst::ICMP_ULT;
+ else
+ Pred = ICmpInst::ICMP_UGE;
- Value *ExitVal =
- Expander.expandCodeFor(ExitValV, EVLIndVar->getType(), OrigLatchCmp);
IRBuilder<> Builder(OrigLatchCmp);
- auto *NewPred = Builder.CreateICmp(ICmpInst::ICMP_UGT, EVLIndVar, ExitVal);
- OrigLatchCmp->replaceAllUsesWith(NewPred);
+ auto *NewLatchCmp = Builder.CreateICmp(Pred, EVLIndVar, TC);
+ OrigLatchCmp->replaceAllUsesWith(NewLatchCmp);
// llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are
// not used outside the cycles. However, in this case the now-RAUW-ed
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 3bb639ce45da93..bb399b4fd0c2eb 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1272,6 +1272,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(LoopVectorizePass(
LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
+ FPM.addPass(createFunctionToLoopPassAdaptor(EVLIndVarSimplifyPass()));
+
FPM.addPass(InferAlignmentPass());
if (IsFullLTO) {
// The vectorizer may have significantly shortened a loop body; unroll
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 24fa9a225fbca1..6a97755c279a29 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -19,7 +19,6 @@
#include "TargetInfo/RISCVTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/EVLIndVarSimplify.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
@@ -467,9 +466,6 @@ void RISCVPassConfig::addIRPasses() {
}
TargetPassConfig::addIRPasses();
-
- if (getOptLevel() != CodeGenOptLevel::None)
- addPass(createEVLIndVarSimplifyPass());
}
bool RISCVPassConfig::addPreISel() {
>From 07d2c2caae27a013c47469dbef3192d57f45b2ed Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 23 Dec 2024 15:11:37 -0800
Subject: [PATCH 10/11] Update tests
---
llvm/test/CodeGen/RISCV/O3-pipeline.ll | 3 -
llvm/test/CodeGen/RISCV/evl-iv-simplify.ll | 82 ++++++----------------
2 files changed, 20 insertions(+), 65 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index cddd28043ef839..b0c756e26985bb 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -68,9 +68,6 @@
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Natural Loop Information
-; CHECK-NEXT: Scalar Evolution Analysis
-; CHECK-NEXT: Loop Pass Manager
-; CHECK-NEXT: EVL-based Induction Variables Simplify
; CHECK-NEXT: Type Promotion
; CHECK-NEXT: CodeGen Prepare
; CHECK-NEXT: Dominator Tree Construction
diff --git a/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll b/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
index 85529cda965acd..33674fd41ce83b 100644
--- a/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
+++ b/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
@@ -20,16 +20,6 @@ define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT: [[TMP22:%.*]] = udiv i64 [[N_RND_UP]], [[TMP10]]
-; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i64 [[TMP22]], 2
-; CHECK-NEXT: [[TMP24:%.*]] = sub i64 4, [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], [[TMP9]]
-; CHECK-NEXT: [[TMP15:%.*]] = sub i64 0, [[TMP25]]
-; CHECK-NEXT: [[TMP16:%.*]] = udiv i64 [[TMP15]], [[TMP10]]
-; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP16]], [[TMP9]]
-; CHECK-NEXT: [[TMP27:%.*]] = shl i64 [[TMP26]], 2
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -38,15 +28,15 @@ define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; CHECK-NEXT: [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[C]], [[VP_OP_LOAD1]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP28:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP27]]
-; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -75,20 +65,6 @@ define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %
; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
; LOOP-DEL: vector.ph:
-; LOOP-DEL-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; LOOP-DEL-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
-; LOOP-DEL-NEXT: [[TMP19:%.*]] = sub i64 [[TMP18]], 1
-; LOOP-DEL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP19]]
-; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; LOOP-DEL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP20]], 4
-; LOOP-DEL-NEXT: [[TMP9:%.*]] = udiv i64 [[N_RND_UP]], [[TMP8]]
-; LOOP-DEL-NEXT: [[TMP21:%.*]] = shl nuw nsw i64 [[TMP9]], 2
-; LOOP-DEL-NEXT: [[TMP22:%.*]] = sub i64 4, [[TMP21]]
-; LOOP-DEL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], [[TMP20]]
-; LOOP-DEL-NEXT: [[TMP24:%.*]] = sub i64 0, [[TMP23]]
-; LOOP-DEL-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], [[TMP8]]
-; LOOP-DEL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP25]], [[TMP20]]
-; LOOP-DEL-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 2
; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
; LOOP-DEL: vector.body:
; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -97,15 +73,15 @@ define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %
; LOOP-DEL-NEXT: [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]]
; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; LOOP-DEL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; LOOP-DEL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; LOOP-DEL-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[C]], [[VP_OP_LOAD1]]
; LOOP-DEL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; LOOP-DEL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP11]], ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; LOOP-DEL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP11]], ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; LOOP-DEL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
-; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP16]]
-; LOOP-DEL-NEXT: br i1 [[TMP26]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = icmp uge i64 [[INDEX_EVL_NEXT]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LOOP-DEL: for.body:
; LOOP-DEL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
@@ -186,26 +162,19 @@ define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[N_VEC]], -16
-; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
+; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP4]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
; CHECK: for.end.loopexit5:
; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: for.end:
@@ -214,26 +183,19 @@ define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 {
; LOOP-DEL-LABEL: define void @fixed_iv_step(
; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; LOOP-DEL-NEXT: entry:
-; LOOP-DEL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
-; LOOP-DEL-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; LOOP-DEL-NEXT: [[TMP5:%.*]] = add i64 [[N_VEC]], -16
-; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 4
-; LOOP-DEL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; LOOP-DEL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP6]], [[TMP7]]
-; LOOP-DEL-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 1
; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
; LOOP-DEL: vector.body:
; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
-; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
+; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
-; LOOP-DEL-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP4]]
-; LOOP-DEL-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp uge i64 [[INDEX_EVL_NEXT]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
; LOOP-DEL: for.end:
; LOOP-DEL-NEXT: ret void
;
@@ -276,19 +238,17 @@ define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 {
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw i64 [[TMP3]], 10
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
+; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP4]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i64 [[INDEX_EVL_NEXT]], 87
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
; CHECK: for.end.loopexit5:
; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: for.end:
@@ -299,19 +259,17 @@ define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 {
; LOOP-DEL-NEXT: entry:
; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; LOOP-DEL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; LOOP-DEL-NEXT: [[TMP4:%.*]] = mul nuw nsw i64 [[TMP3]], 10
; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]]
; LOOP-DEL: vector.body:
; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]]
; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
-; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
+; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP1]])
; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
-; LOOP-DEL-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[INDEX_EVL_NEXT]], [[TMP4]]
-; LOOP-DEL-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp uge i64 [[INDEX_EVL_NEXT]], 87
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
; LOOP-DEL: for.end:
; LOOP-DEL-NEXT: ret void
;
>From 284f7332d1f4b274225395a49485a3e9d4d86641 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 23 Dec 2024 15:18:02 -0800
Subject: [PATCH 11/11] Removed legacy Pass
Since this pass is no longer added to the codegen pipeline, the legacy PassManager wrapper is unnecessary.
---
llvm/include/llvm/CodeGen/EVLIndVarSimplify.h | 3 --
llvm/include/llvm/InitializePasses.h | 1 -
llvm/lib/CodeGen/EVLIndVarSimplify.cpp | 35 -------------------
3 files changed, 39 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/EVLIndVarSimplify.h b/llvm/include/llvm/CodeGen/EVLIndVarSimplify.h
index 88549d443b8e66..63e8b74f87d1cf 100644
--- a/llvm/include/llvm/CodeGen/EVLIndVarSimplify.h
+++ b/llvm/include/llvm/CodeGen/EVLIndVarSimplify.h
@@ -20,7 +20,6 @@
namespace llvm {
class Loop;
class LPMUpdater;
-class Pass;
/// Turn vectorized loops with canonical induction variables into loops that
/// only use a single EVL-based induction variable.
@@ -28,7 +27,5 @@ struct EVLIndVarSimplifyPass : public PassInfoMixin<EVLIndVarSimplifyPass> {
PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM,
LoopStandardAnalysisResults &AR, LPMUpdater &U);
};
-
-Pass *createEVLIndVarSimplifyPass();
} // namespace llvm
#endif
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index de414917a95aa4..1cb9013bc48cc5 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -113,7 +113,6 @@ void initializeExpandReductionsPass(PassRegistry &);
void initializeExpandVariadicsPass(PassRegistry &);
void initializeExpandVectorPredicationPass(PassRegistry &);
void initializeExternalAAWrapperPassPass(PassRegistry &);
-void initializeEVLIndVarSimplifyPass(PassRegistry &);
void initializeFEntryInserterPass(PassRegistry &);
void initializeFinalizeISelPass(PassRegistry &);
void initializeFinalizeMachineBundlesPass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
index 2004b86c1284c3..c730b34ca5e370 100644
--- a/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
+++ b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
@@ -21,8 +21,6 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
@@ -54,21 +52,6 @@ struct EVLIndVarSimplifyImpl {
// Returns true if modify the loop.
bool run(Loop &L);
};
-
-struct EVLIndVarSimplify : public LoopPass {
- static char ID;
-
- EVLIndVarSimplify() : LoopPass(ID) {
- initializeEVLIndVarSimplifyPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.setPreservesCFG();
- }
-};
} // anonymous namespace
static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
@@ -251,21 +234,3 @@ PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM,
return PreservedAnalyses::allInSet<CFGAnalyses>();
return PreservedAnalyses::all();
}
-
-char EVLIndVarSimplify::ID = 0;
-
-INITIALIZE_PASS_BEGIN(EVLIndVarSimplify, DEBUG_TYPE,
- "EVL-based Induction Variables Simplify", false, false)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(EVLIndVarSimplify, DEBUG_TYPE,
- "EVL-based Induction Variables Simplify", false, false)
-
-bool EVLIndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipLoop(L))
- return false;
-
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- return EVLIndVarSimplifyImpl(SE).run(*L);
-}
-
-Pass *llvm::createEVLIndVarSimplifyPass() { return new EVLIndVarSimplify(); }
More information about the llvm-commits
mailing list