[llvm] [LV] Introduce the EVLIVSimplify Pass for EVL-vectorized loops (PR #91796)

Thu May 16 11:27:13 PDT 2024

https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/91796

>From ffc2ad02411b8ef3c9409c9c1e43ef1be5f7a8a2 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Fri, 10 May 2024 11:27:40 -0700
Subject: [PATCH 1/3] [LV] Introduce the EVLIVSimplify Pass for EVL-vectorized
 loops

When we enable EVL-based loop vectorization w/ predicated tail-folding,
each vectorized loop has effectively two induction variables: one
calculates the step using (VF x vscale) and the other one increases the
IV by values returned from experiment.get.vector.length. The former,
also known as canonical IV, is more favorable for analyses as it's "countable"
in the sense of SCEV; the latter (EVL-based IV), however, is more favorable to
codegen, at least for those that support scalable vectors like AArch64 SVE and
RISC-V.

The idea is that we use canonical IV all the way until the beginning of codegen
pipeline, where we replace it with EVL-based IV using EVLIVSimplify
introduced here. Such that we can have the best from both worlds.

This Pass is enabled by default in RISC-V. However, since we haven't
really vectorize loops with predicate tail-folding, this Pass is
no-op at this moment.

That said, I have validate the correctness of this Pass by enable
EVL-based LV + predicated tail-folding
(i.e. -force-tail-folding-style=data-with-evl
-prefer-predicate-over-epilogue=predicate-dont-vectorize) and run on
SPEC2006INT and SPEC2017 intrate w/ test workload.
---
 llvm/include/llvm/InitializePasses.h          |   1 +
 .../Transforms/Vectorize/EVLIndVarSimplify.h  |  34 +++
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   1 +
 llvm/lib/Passes/PassRegistry.def              |   1 +
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |   4 +
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   1 +
 .../Vectorize/EVLIndVarSimplify.cpp           | 253 ++++++++++++++++
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |   3 +
 .../LoopVectorize/RISCV/evl-iv-simplify.ll    | 282 ++++++++++++++++++
 10 files changed, 581 insertions(+)
 create mode 100644 llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
 create mode 100644 llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll

diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 9ba75d491c1c9..5e3eff127046d 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -130,6 +130,7 @@ void initializeIfConverterPass(PassRegistry&);
 void initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry&);
 void initializeImplicitNullChecksPass(PassRegistry&);
 void initializeIndirectBrExpandLegacyPassPass(PassRegistry &);
+void initializeEVLIndVarSimplifyPass(PassRegistry &);
 void initializeInferAddressSpacesPass(PassRegistry&);
 void initializeInstSimplifyLegacyPassPass(PassRegistry &);
 void initializeInstructionCombiningPassPass(PassRegistry&);
diff --git a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h b/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
new file mode 100644
index 0000000000000..9b1c207439f8a
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
@@ -0,0 +1,34 @@
+//===-------- EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes a vectorized loop with canonical IV to using EVL-based
+// IV if it was tail-folded by predicated EVL.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
+#define LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
+
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class Loop;
+class LPMUpdater;
+class Pass;
+
+/// Turn vectorized loops with canonical induction variables into loops that
+/// only use a single EVL-based induction variable.
+struct EVLIndVarSimplifyPass : public PassInfoMixin<EVLIndVarSimplifyPass> {
+  PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+
+Pass *createEVLIndVarSimplifyPass();
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index e4131706aba01..8b32bc623ecad 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -294,6 +294,7 @@
 #include "llvm/Transforms/Utils/SymbolRewriter.h"
 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
 #include "llvm/Transforms/Utils/UnifyLoopExits.h"
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 1d7f0510450c9..ce11d023c85b4 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -135,6 +135,7 @@
 #include "llvm/Transforms/Utils/NameAnonGlobals.h"
 #include "llvm/Transforms/Utils/RelLookupTableConverter.h"
 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
 #include "llvm/Transforms/Vectorize/VectorCombine.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index e5ce6cb7da649..683b7d2f64a42 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -602,6 +602,7 @@ LOOP_ANALYSIS("should-run-extra-simple-loop-unswitch",
 #endif
 LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass())
 LOOP_PASS("dot-ddg", DDGDotPrinterPass())
+LOOP_PASS("evl-iv-simplify", EVLIndVarSimplifyPass())
 LOOP_PASS("guard-widening", GuardWideningPass())
 LOOP_PASS("indvars", IndVarSimplifyPass())
 LOOP_PASS("invalidate<all>", InvalidateAllAnalysesPass())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 7b2dcadc41917..88dca1fdaab8d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -37,6 +37,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
 #include <optional>
 using namespace llvm;
 
@@ -419,6 +420,9 @@ void RISCVPassConfig::addIRPasses() {
   }
 
   TargetPassConfig::addIRPasses();
+
+  if (getOptLevel() != CodeGenOptLevel::None)
+    addPass(createEVLIndVarSimplifyPass());
 }
 
 bool RISCVPassConfig::addPreISel() {
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 9674094024b9e..0bcc198086693 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_llvm_component_library(LLVMVectorize
+  EVLIndVarSimplify.cpp
   LoadStoreVectorizer.cpp
   LoopVectorizationLegality.cpp
   LoopVectorize.cpp
diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
new file mode 100644
index 0000000000000..21c453925cd76
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
@@ -0,0 +1,253 @@
+//===------ EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes a vectorized loop with canonical IV to using EVL-based
+// IV if it was tail-folded by predicated EVL.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+
+#define DEBUG_TYPE "evl-iv-simplify"
+
+using namespace llvm;
+
+STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated");
+
+namespace {
+struct EVLIndVarSimplifyImpl {
+  ScalarEvolution &SE;
+
+  explicit EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR)
+      : SE(LAR.SE) {}
+
+  explicit EVLIndVarSimplifyImpl(ScalarEvolution &SE) : SE(SE) {}
+
+  // Returns true if modify the loop.
+  bool run(Loop &L);
+};
+
+struct EVLIndVarSimplify : public LoopPass {
+  static char ID;
+
+  EVLIndVarSimplify() : LoopPass(ID) {
+    initializeEVLIndVarSimplifyPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.setPreservesCFG();
+  }
+};
+} // anonymous namespace
+
+static std::optional<uint32_t> getVFFromIndVar(const SCEV *Step,
+                                               const Function &F) {
+  if (!Step)
+    return std::nullopt;
+
+  // Looking for loops with IV step value in the form of `(<constant VF> x
+  // vscale)`.
+  if (auto *Mul = dyn_cast<SCEVMulExpr>(Step)) {
+    if (Mul->getNumOperands() == 2) {
+      const SCEV *LHS = Mul->getOperand(0);
+      const SCEV *RHS = Mul->getOperand(1);
+      if (auto *Const = dyn_cast<SCEVConstant>(LHS)) {
+        uint64_t V = Const->getAPInt().getLimitedValue();
+        if (isa<SCEVVScale>(RHS) && llvm::isUInt<32>(V))
+          return static_cast<uint32_t>(V);
+      }
+    }
+  }
+
+  // If not, see if the vscale_range of the parent function is a fixed value,
+  // which makes the step value to be replaced by a constant.
+  if (isa<SCEVConstant>(Step) && F.hasFnAttribute(Attribute::VScaleRange)) {
+    APInt V = cast<SCEVConstant>(Step)->getAPInt().abs();
+    ConstantRange CR = llvm::getVScaleRange(&F, 64);
+    if (const APInt *Fixed = CR.getSingleElement()) {
+      V = V.zextOrTrunc(Fixed->getBitWidth());
+      uint64_t VF = V.udiv(*Fixed).getLimitedValue();
+      if (VF && llvm::isUInt<32>(VF))
+        return static_cast<uint32_t>(VF);
+    }
+  }
+
+  return std::nullopt;
+}
+
+// Remove the original induction variable if it's not used anywhere.
+static void cleanupOriginalIndVar(PHINode *OrigIndVar, BasicBlock *InitBlock,
+                                  BasicBlock *BackEdgeBlock) {
+  Value *InitValue = OrigIndVar->getIncomingValueForBlock(InitBlock);
+  Value *RecValue = OrigIndVar->getIncomingValueForBlock(BackEdgeBlock);
+
+  // If the only user of OrigIndVar is the one produces RecValue, then we can
+  // safely remove it.
+  if (!OrigIndVar->hasOneUse() || OrigIndVar->user_back() != RecValue)
+    return;
+
+  LLVM_DEBUG(dbgs() << "Removed the original IndVar " << *OrigIndVar << "\n");
+  // Remove OrigIndVar by replacing all its uses by the initial value of this
+  // loop. Then DCE will take care of the rest.
+  OrigIndVar->replaceAllUsesWith(InitValue);
+  OrigIndVar->eraseFromParent();
+}
+
+bool EVLIndVarSimplifyImpl::run(Loop &L) {
+  InductionDescriptor IVD;
+  PHINode *IndVar = L.getInductionVariable(SE);
+  if (!IndVar || !L.getInductionDescriptor(SE, IVD)) {
+    LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName()
+                      << "\n");
+    return false;
+  }
+
+  BasicBlock *InitBlock, *BackEdgeBlock;
+  if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) {
+    LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in "
+                      << L.getName() << "\n");
+    return false;
+  }
+
+  // Retrieve the loop bounds.
+  std::optional<Loop::LoopBounds> Bounds = L.getBounds(SE);
+  if (!Bounds) {
+    LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName()
+                      << "\n");
+    return false;
+  }
+  Value *CanonicalIVInit = &Bounds->getInitialIVValue();
+  Value *CanonicalIVFinal = &Bounds->getFinalIVValue();
+
+  const SCEV *StepV = IVD.getStep();
+  auto VF = getVFFromIndVar(StepV, *L.getHeader()->getParent());
+  if (!VF) {
+    LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV
+                      << "'\n");
+    return false;
+  }
+  LLVM_DEBUG(dbgs() << "Using VF=" << *VF << " for loop " << L.getName()
+                    << "\n");
+
+  // Try to find the EVL-based induction variable.
+  using namespace PatternMatch;
+  BasicBlock *BB = IndVar->getParent();
+
+  Value *EVLIndex = nullptr;
+  Value *RemVL = nullptr, *AVL = nullptr;
+  auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>(
+      m_Value(RemVL), m_SpecificInt(*VF),
+      /*Scalable=*/m_SpecificInt(1));
+  for (auto &PN : BB->phis()) {
+    if (&PN == IndVar)
+      continue;
+
+    // Check 1: it has to contain both incoming (init) & backedge blocks
+    // from IndVar.
+    if (PN.getBasicBlockIndex(InitBlock) < 0 ||
+        PN.getBasicBlockIndex(BackEdgeBlock) < 0)
+      continue;
+    // Check 2: EVL index is always increasing, thus its inital value has to be
+    // equal to either the initial IV value (when the canonical IV is also
+    // increasing) or the last IV value (when canonical IV is decreasing).
+    Value *Init = PN.getIncomingValueForBlock(InitBlock);
+    using Direction = Loop::LoopBounds::Direction;
+    switch (Bounds->getDirection()) {
+    case Direction::Increasing:
+      if (Init != CanonicalIVInit)
+        continue;
+      break;
+    case Direction::Decreasing:
+      if (Init != CanonicalIVFinal)
+        continue;
+      break;
+    case Direction::Unknown:
+      // To be more permissive and see if either the initial or final IV value
+      // matches PN's init value.
+      if (Init != CanonicalIVInit && Init != CanonicalIVFinal)
+        continue;
+      break;
+    }
+    Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock);
+    assert(RecValue);
+
+    LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN
+                      << "\n");
+
+    // Check 3: Pattern match to find the EVL-based index and total trip count
+    // (AVL).
+    if (match(RecValue,
+              m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) &&
+        match(RemVL, m_Sub(m_Value(AVL), m_Specific(&PN)))) {
+      EVLIndex = RecValue;
+      break;
+    }
+  }
+
+  if (!EVLIndex || !AVL)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Using " << *EVLIndex << " for EVL-based IndVar\n");
+
+  // Create an EVL-based comparison and replace the branch to use it as
+  // predicate.
+  ICmpInst *OrigLatchCmp = L.getLatchCmpInst();
+  ICmpInst::Predicate Pred = OrigLatchCmp->getPredicate();
+  if (!ICmpInst::isEquality(Pred))
+    return false;
+
+  IRBuilder<> Builder(OrigLatchCmp);
+  auto *NewPred = Builder.CreateICmp(Pred, EVLIndex, AVL);
+  OrigLatchCmp->replaceAllUsesWith(NewPred);
+
+  cleanupOriginalIndVar(IndVar, InitBlock, BackEdgeBlock);
+
+  ++NumEliminatedCanonicalIV;
+
+  return true;
+}
+
+PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM,
+                                             LoopStandardAnalysisResults &AR,
+                                             LPMUpdater &U) {
+  if (EVLIndVarSimplifyImpl(AR).run(L))
+    return PreservedAnalyses::allInSet<CFGAnalyses>();
+  return PreservedAnalyses::all();
+}
+
+char EVLIndVarSimplify::ID = 0;
+
+INITIALIZE_PASS_BEGIN(EVLIndVarSimplify, DEBUG_TYPE,
+                      "EVL-based Induction Variables Simplify", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(EVLIndVarSimplify, DEBUG_TYPE,
+                    "EVL-based Induction Variables Simplify", false, false)
+
+bool EVLIndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+  auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  return EVLIndVarSimplifyImpl(SE).run(*L);
+}
+
+Pass *llvm::createEVLIndVarSimplifyPass() { return new EVLIndVarSimplify(); }
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 4a71d3276d263..4f4ba0efd8fe3 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -68,6 +68,9 @@
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       TLS Variable Hoist
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Loop Pass Manager
+; CHECK-NEXT:         EVL-based Induction Variables Simplify
 ; CHECK-NEXT:       Type Promotion
 ; CHECK-NEXT:       CodeGen Prepare
 ; CHECK-NEXT:       Dominator Tree Construction
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
new file mode 100644
index 0000000000000..5db92fa7255f2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
@@ -0,0 +1,282 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify)' < %s | FileCheck %s
+; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify),function(simplifycfg,dce)' < %s | FileCheck %s --check-prefix=LOOP-DEL
+
+define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; CHECK-LABEL: define void @simple(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; CHECK-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = add i64 0, [[TMP10]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP25]], [[TMP24]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; LOOP-DEL-LABEL: define void @simple(
+; LOOP-DEL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
+; LOOP-DEL:       vector.ph:
+; LOOP-DEL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; LOOP-DEL:       vector.body:
+; LOOP-DEL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LOOP-DEL-NEXT:    [[TMP4:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP4]], i32 4, i1 true)
+; LOOP-DEL-NEXT:    [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; LOOP-DEL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]]
+; LOOP-DEL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; LOOP-DEL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; LOOP-DEL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP6]]
+; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; LOOP-DEL-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; LOOP-DEL-NEXT:    [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; LOOP-DEL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
+; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; LOOP-DEL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP11]], ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
+; LOOP-DEL-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
+; LOOP-DEL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LOOP-DEL:       for.body:
+; LOOP-DEL-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; LOOP-DEL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; LOOP-DEL-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; LOOP-DEL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; LOOP-DEL-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; LOOP-DEL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]]
+; LOOP-DEL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; LOOP-DEL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; LOOP-DEL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; LOOP-DEL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; LOOP-DEL:       for.cond.cleanup:
+; LOOP-DEL-NEXT:    ret void
+;
+entry:
+  %0 = sub i64 -1, %N
+  %1 = call i64 @llvm.vscale.i64()
+  %2 = mul i64 %1, 4
+  %3 = icmp ult i64 %0, %2
+  br i1 %3, label %scalar.ph, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %4 = call i64 @llvm.vscale.i64()
+  %5 = mul i64 %4, 4
+  %6 = call i64 @llvm.vscale.i64()
+  %7 = mul i64 %6, 4
+  %8 = sub i64 %7, 1
+  %n.rnd.up = add i64 %N, %8
+  %n.mod.vf = urem i64 %n.rnd.up, %5
+  %n.vec = sub i64 %n.rnd.up, %n.mod.vf
+  %9 = call i64 @llvm.vscale.i64()
+  %10 = mul i64 %9, 4
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ]
+  %11 = sub i64 %N, %evl.based.iv
+  %12 = call i32 @llvm.experimental.get.vector.length.i64(i64 %11, i32 4, i1 true)
+  %13 = add i64 %evl.based.iv, 0
+  %14 = getelementptr inbounds i32, ptr %b, i64 %13
+  %15 = getelementptr inbounds i32, ptr %14, i32 0
+  %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %15, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
+  %16 = getelementptr inbounds i32, ptr %c, i64 %13
+  %17 = getelementptr inbounds i32, ptr %16, i32 0
+  %vp.op.load1 = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %17, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
+  %18 = add nsw <vscale x 4 x i32> %vp.op.load1, %vp.op.load
+  %19 = getelementptr inbounds i32, ptr %a, i64 %13
+  %20 = getelementptr inbounds i32, ptr %19, i32 0
+  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %18, ptr align 4 %20, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
+  %21 = zext i32 %12 to i64
+  %index.evl.next = add i64 %21, %evl.based.iv
+  %index.next = add i64 %index, %10
+  %22 = icmp eq i64 %index.next, %n.vec
+  br i1 %22, label %middle.block, label %vector.body, !llvm.loop !0
+
+middle.block:                                     ; preds = %vector.body
+  br i1 true, label %for.cond.cleanup, label %scalar.ph
+
+scalar.ph:                                        ; preds = %entry, %middle.block
+  %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %entry ]
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %scalar.ph
+  %iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+  %23 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %24 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %24, %23
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !3
+
+for.cond.cleanup:                                 ; preds = %middle.block, %for.body
+  ret void
+}
+
+; Fixed IV steps resulting from vscale_range with a single element
+
+define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 {
+; CHECK-LABEL: define void @fixed_iv_step(
+; CHECK-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add nsw i64 [[N]], 15
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT:    tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
+; CHECK-NEXT:    [[LSR_IV_NEXT33:%.*]] = add i64 [[N_VEC]], -16
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[LSR_IV_NEXT33]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; CHECK:       for.end.loopexit5:
+; CHECK-NEXT:    br label [[FOR_END:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+; LOOP-DEL-LABEL: define void @fixed_iv_step(
+; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; LOOP-DEL-NEXT:  entry:
+; LOOP-DEL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[ARG0]], i64 0
+; LOOP-DEL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; LOOP-DEL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; LOOP-DEL:       vector.body:
+; LOOP-DEL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LOOP-DEL-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true)
+; LOOP-DEL-NEXT:    [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]]
+; LOOP-DEL-NEXT:    tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP1]])
+; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; LOOP-DEL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]]
+; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3]]
+; LOOP-DEL:       for.end:
+; LOOP-DEL-NEXT:    ret void
+;
+entry:
+  br label %vector.ph
+
+vector.ph:
+  %n.rnd.up = add nsw i64 %N, 15
+  %n.vec = and i64 %n.rnd.up, -16
+  %broadcast.splatinsert = insertelement <vscale x 2 x ptr> poison, ptr %arg0, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x ptr> %broadcast.splatinsert, <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:
+  %lsr.iv32 = phi i64 [ %lsr.iv.next33, %vector.body ], [ %n.vec, %vector.ph ]
+  %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ]
+  %41 = sub i64 %N, %evl.based.iv
+  %42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true)
+  %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv
+  tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> %broadcast.splat, ptr align 8 %gep, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %42)
+  %43 = zext i32 %42 to i64
+  %index.evl.next = add i64 %evl.based.iv, %43
+  %lsr.iv.next33 = add i64 %lsr.iv32, -16
+  %44 = icmp eq i64 %lsr.iv.next33, 0
+  br i1 %44, label %for.end.loopexit5, label %vector.body, !llvm.loop !3
+
+for.end.loopexit5:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg)
+
+declare <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr nocapture, <vscale x 4 x i1>, i32)
+
+declare void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, <vscale x 4 x i1>, i32)
+
+attributes #0 = { vscale_range(8,8) }
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.isvectorized", i32 1}
+!2 = !{!"llvm.loop.unroll.runtime.disable"}
+!3 = distinct !{!3, !2, !1}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; LOOP-DEL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; LOOP-DEL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; LOOP-DEL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; LOOP-DEL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

>From 42df26efa115002cf4ee370e79eb189e9e870ff7 Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Mon, 13 May 2024 10:41:45 -0700
Subject: [PATCH 2/3] Address review comments

And simplify the test cases.
---
 .../Vectorize/EVLIndVarSimplify.cpp           | 19 ++++----
 .../LoopVectorize/RISCV/evl-iv-simplify.ll    | 46 ++++++-------------
 2 files changed, 24 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
index 21c453925cd76..ba9a707dbea44 100644
--- a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
@@ -83,16 +83,17 @@ static std::optional<uint32_t> getVFFromIndVar(const SCEV *Step,
 
   // If not, see if the vscale_range of the parent function is a fixed value,
   // which makes the step value to be replaced by a constant.
-  if (isa<SCEVConstant>(Step) && F.hasFnAttribute(Attribute::VScaleRange)) {
-    APInt V = cast<SCEVConstant>(Step)->getAPInt().abs();
-    ConstantRange CR = llvm::getVScaleRange(&F, 64);
-    if (const APInt *Fixed = CR.getSingleElement()) {
-      V = V.zextOrTrunc(Fixed->getBitWidth());
-      uint64_t VF = V.udiv(*Fixed).getLimitedValue();
-      if (VF && llvm::isUInt<32>(VF))
-        return static_cast<uint32_t>(VF);
+  if (F.hasFnAttribute(Attribute::VScaleRange))
+    if (auto *ConstStep = dyn_cast<SCEVConstant>(Step)) {
+      APInt V = ConstStep->getAPInt().abs();
+      ConstantRange CR = llvm::getVScaleRange(&F, 64);
+      if (const APInt *Fixed = CR.getSingleElement()) {
+        V = V.zextOrTrunc(Fixed->getBitWidth());
+        uint64_t VF = V.udiv(*Fixed).getLimitedValue();
+        if (VF && llvm::isUInt<32>(VF))
+          return static_cast<uint32_t>(VF);
+      }
     }
-  }
 
   return std::nullopt;
 }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
index 5db92fa7255f2..72cb30f270f24 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
@@ -2,9 +2,9 @@
 ; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify)' < %s | FileCheck %s
 ; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify),function(simplifycfg,dce)' < %s | FileCheck %s --check-prefix=LOOP-DEL
 
-define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+define void @simple(ptr noalias %a, ptr noalias %b, <vscale x 4 x i32> %c, i64 %N) {
 ; CHECK-LABEL: define void @simple(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
@@ -29,12 +29,9 @@ define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
 ; CHECK-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[C]], [[VP_OP_LOAD1]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
 ; CHECK-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
@@ -52,10 +49,7 @@ define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP25]], [[TMP24]]
+; CHECK-NEXT:    [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -67,7 +61,7 @@ define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; CHECK-NEXT:    ret void
 ;
 ; LOOP-DEL-LABEL: define void @simple(
-; LOOP-DEL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; LOOP-DEL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; LOOP-DEL-NEXT:  entry:
 ; LOOP-DEL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; LOOP-DEL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
@@ -82,12 +76,9 @@ define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; LOOP-DEL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP4]], i32 4, i1 true)
 ; LOOP-DEL-NEXT:    [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0
 ; LOOP-DEL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]]
-; LOOP-DEL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; LOOP-DEL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
-; LOOP-DEL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP6]]
-; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; LOOP-DEL-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
-; LOOP-DEL-NEXT:    [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; LOOP-DEL-NEXT:    [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[C]], [[VP_OP_LOAD1]]
 ; LOOP-DEL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
 ; LOOP-DEL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP11]], ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP5]])
@@ -98,10 +89,7 @@ define void @simple(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; LOOP-DEL:       for.body:
 ; LOOP-DEL-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; LOOP-DEL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; LOOP-DEL-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; LOOP-DEL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; LOOP-DEL-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; LOOP-DEL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]]
+; LOOP-DEL-NEXT:    [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; LOOP-DEL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; LOOP-DEL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; LOOP-DEL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -138,14 +126,11 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %13 = add i64 %evl.based.iv, 0
   %14 = getelementptr inbounds i32, ptr %b, i64 %13
   %15 = getelementptr inbounds i32, ptr %14, i32 0
-  %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %15, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
-  %16 = getelementptr inbounds i32, ptr %c, i64 %13
-  %17 = getelementptr inbounds i32, ptr %16, i32 0
-  %vp.op.load1 = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %17, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
-  %18 = add nsw <vscale x 4 x i32> %vp.op.load1, %vp.op.load
+  %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %15, <vscale x 4 x i1> splat (i1 true), i32 %12)
+  %18 = add nsw <vscale x 4 x i32> %c, %vp.op.load
   %19 = getelementptr inbounds i32, ptr %a, i64 %13
   %20 = getelementptr inbounds i32, ptr %19, i32 0
-  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %18, ptr align 4 %20, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %12)
+  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %18, ptr align 4 %20, <vscale x 4 x i1> splat (i1 true), i32 %12)
   %21 = zext i32 %12 to i64
   %index.evl.next = add i64 %21, %evl.based.iv
   %index.next = add i64 %index, %10
@@ -163,11 +148,8 @@ for.body:                                         ; preds = %for.body, %scalar.p
   %iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %iv.next, %for.body ]
   %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
   %23 = load i32, ptr %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
-  %24 = load i32, ptr %arrayidx2, align 4
-  %add = add nsw i32 %24, %23
   %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
-  store i32 %add, ptr %arrayidx4, align 4
+  store i32 %23, ptr %arrayidx4, align 4
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %N
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !3
@@ -241,7 +223,7 @@ vector.body:
   %41 = sub i64 %N, %evl.based.iv
   %42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true)
   %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv
-  tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> %broadcast.splat, ptr align 8 %gep, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %42)
+  tail call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> %broadcast.splat, ptr align 8 %gep, <vscale x 2 x i1> splat (i1 true), i32 %42)
   %43 = zext i32 %42 to i64
   %index.evl.next = add i64 %evl.based.iv, %43
   %lsr.iv.next33 = add i64 %lsr.iv32, -16

>From 219407fd155b5bb6b135261275b09c35c1b26e6d Mon Sep 17 00:00:00 2001
From: Min Hsu <min.hsu at sifive.com>
Date: Thu, 16 May 2024 11:09:46 -0700
Subject: [PATCH 3/3] Move EVLIndVarSimplify to CodeGen

---
 .../{Transforms/Vectorize => CodeGen}/EVLIndVarSimplify.h   | 6 +++---
 llvm/lib/CodeGen/CMakeLists.txt                             | 1 +
 .../{Transforms/Vectorize => CodeGen}/EVLIndVarSimplify.cpp | 2 +-
 llvm/lib/Passes/PassBuilder.cpp                             | 2 +-
 llvm/lib/Passes/PassBuilderPipelines.cpp                    | 2 +-
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp                | 2 +-
 llvm/lib/Transforms/Vectorize/CMakeLists.txt                | 1 -
 .../LoopVectorize => CodeGen}/RISCV/evl-iv-simplify.ll      | 0
 8 files changed, 8 insertions(+), 8 deletions(-)
 rename llvm/include/llvm/{Transforms/Vectorize => CodeGen}/EVLIndVarSimplify.h (84%)
 rename llvm/lib/{Transforms/Vectorize => CodeGen}/EVLIndVarSimplify.cpp (99%)
 rename llvm/test/{Transforms/LoopVectorize => CodeGen}/RISCV/evl-iv-simplify.ll (100%)

diff --git a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h b/llvm/include/llvm/CodeGen/EVLIndVarSimplify.h
similarity index 84%
rename from llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
rename to llvm/include/llvm/CodeGen/EVLIndVarSimplify.h
index 9b1c207439f8a..88549d443b8e6 100644
--- a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h
+++ b/llvm/include/llvm/CodeGen/EVLIndVarSimplify.h
@@ -1,4 +1,4 @@
-//===-------- EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV----===//
+//===- EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV-*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
-#define LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
+#ifndef LLVM_CODEGEN_EVLINDVARSIMPLIFY_H
+#define LLVM_CODEGEN_EVLINDVARSIMPLIFY_H
 
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/IR/PassManager.h"
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 77bf1b165d0cf..be08dcc1b2cfb 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_component_library(LLVMCodeGen
   EarlyIfConversion.cpp
   EdgeBundles.cpp
   EHContGuardCatchret.cpp
+  EVLIndVarSimplify.cpp
   ExecutionDomainFix.cpp
   ExpandLargeDivRem.cpp
   ExpandLargeFpConvert.cpp
diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
similarity index 99%
rename from llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
rename to llvm/lib/CodeGen/EVLIndVarSimplify.cpp
index ba9a707dbea44..b7a79be998d12 100644
--- a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp
+++ b/llvm/lib/CodeGen/EVLIndVarSimplify.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
+#include "llvm/CodeGen/EVLIndVarSimplify.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopInfo.h"
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 8b32bc623ecad..2e6305620f56d 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -79,6 +79,7 @@
 #include "llvm/CodeGen/CodeGenPrepare.h"
 #include "llvm/CodeGen/DeadMachineInstructionElim.h"
 #include "llvm/CodeGen/DwarfEHPrepare.h"
+#include "llvm/CodeGen/EVLIndVarSimplify.h"
 #include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandLargeFpConvert.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
@@ -294,7 +295,6 @@
 #include "llvm/Transforms/Utils/SymbolRewriter.h"
 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
 #include "llvm/Transforms/Utils/UnifyLoopExits.h"
-#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index ce11d023c85b4..ff9ff34637234 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScopedNoAliasAA.h"
 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/CodeGen/EVLIndVarSimplify.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Passes/OptimizationLevel.h"
 #include "llvm/Passes/PassBuilder.h"
@@ -135,7 +136,6 @@
 #include "llvm/Transforms/Utils/NameAnonGlobals.h"
 #include "llvm/Transforms/Utils/RelLookupTableConverter.h"
 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
-#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
 #include "llvm/Transforms/Vectorize/VectorCombine.h"
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 88dca1fdaab8d..759cbea995b62 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -19,6 +19,7 @@
 #include "TargetInfo/RISCVTargetInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/EVLIndVarSimplify.h"
 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
@@ -37,7 +38,6 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
 #include <optional>
 using namespace llvm;
 
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 0bcc198086693..9674094024b9e 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_llvm_component_library(LLVMVectorize
-  EVLIndVarSimplify.cpp
   LoadStoreVectorizer.cpp
   LoopVectorizationLegality.cpp
   LoopVectorize.cpp
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll b/llvm/test/CodeGen/RISCV/evl-iv-simplify.ll
similarity index 100%
rename from llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll
rename to llvm/test/CodeGen/RISCV/evl-iv-simplify.ll