[llvm] [LV] Introduce the EVLIVSimplify Pass for EVL-vectorized loops (PR #91796)

Wed Jun 5 14:30:07 PDT 2024

================
@@ -0,0 +1,296 @@
+//===------ EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass rewrites a vectorized loop that uses a canonical IV to use the
+// EVL-based IV instead, if the loop was tail-folded by predicated EVL.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/EVLIndVarSimplify.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+
+#define DEBUG_TYPE "evl-iv-simplify"
+
+using namespace llvm;
+
+STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated");
+
+static cl::opt<bool> EnableEVLIndVarSimplify(
+    "enable-evl-indvar-simplify",
+    cl::desc("Enable EVL-based induction variable simplify Pass"), cl::Hidden,
+    cl::init(true)); // On by default; run() returns early when it is false.
+
+namespace {
+struct EVLIndVarSimplifyImpl { // Holds the analysis that run() queries.
+  ScalarEvolution &SE; // Referenced, not owned.
+
+  explicit EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR)
+      : SE(LAR.SE) {} // Picks SE out of the standard loop analysis results.
+
+  explicit EVLIndVarSimplifyImpl(ScalarEvolution &SE) : SE(SE) {}
+
+  // Returns true if the loop was modified.
+  bool run(Loop &L);
+};
+
+struct EVLIndVarSimplify : public LoopPass { // LoopPass form of this pass.
+  static char ID; // Passed to the LoopPass constructor below.
+
+  EVLIndVarSimplify() : LoopPass(ID) {
+    initializeEVLIndVarSimplifyPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ScalarEvolutionWrapperPass>(); // ScalarEvolution is needed.
+    AU.setPreservesCFG(); // Declares the CFG as preserved.
+  }
+};
+} // anonymous namespace
+
+static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
+  if (!Step) // A return value of 0 means that no VF could be inferred.
+    return 0U;
+
+  // Look for an IV step of the form `(<constant VF> x vscale)`; the constant
+  // operand is returned as the VF when it fits in 32 bits.
+  if (auto *Mul = dyn_cast<SCEVMulExpr>(Step)) {
+    if (Mul->getNumOperands() == 2) {
+      const SCEV *LHS = Mul->getOperand(0);
+      const SCEV *RHS = Mul->getOperand(1);
+      if (auto *Const = dyn_cast<SCEVConstant>(LHS)) {
+        uint64_t V = Const->getAPInt().getLimitedValue();
+        if (isa<SCEVVScale>(RHS) && llvm::isUInt<32>(V))
+          return static_cast<uint32_t>(V);
+      }
+    }
+  }
+
+  // Otherwise, check whether the function's vscale_range pins vscale to a
+  // single value, in which case the step shows up as a plain constant.
+  if (F.hasFnAttribute(Attribute::VScaleRange))
+    if (auto *ConstStep = dyn_cast<SCEVConstant>(Step)) {
+      APInt V = ConstStep->getAPInt().abs(); // Work on the step's magnitude.
+      ConstantRange CR = llvm::getVScaleRange(&F, 64);
+      if (const APInt *Fixed = CR.getSingleElement()) {
+        V = V.zextOrTrunc(Fixed->getBitWidth()); // Match widths before udiv.
+        uint64_t VF = V.udiv(*Fixed).getLimitedValue(); // |step| / vscale.
+        if (VF && llvm::isUInt<32>(VF) &&
+            // Make sure step is divisible by vscale.
+            V.urem(*Fixed).isZero())
+          return static_cast<uint32_t>(VF);
+      }
+    }
+
+  return 0U;
+}
+
+// Remove the original induction variable if its only user is its own update.
+static void tryCleanupOriginalIndVar(PHINode *OrigIndVar,
+                                     const InductionDescriptor &IVD) {
+  if (OrigIndVar->getNumIncomingValues() != 2)
+    return;
+  Value *InitValue = OrigIndVar->getIncomingValue(0);
+  Value *RecValue = OrigIndVar->getIncomingValue(1);
+  if (InitValue != IVD.getStartValue())
+    std::swap(InitValue, RecValue); // Incoming value 0 was not the start value.
+
+  // If the only user of OrigIndVar is the one that produces RecValue, then we
+  // can safely remove it.
+  if (!OrigIndVar->hasOneUse() || OrigIndVar->user_back() != RecValue)
+    return;
+
+  LLVM_DEBUG(dbgs() << "Removed the original IndVar " << *OrigIndVar << "\n");
+  // Replace every use of OrigIndVar with the IV's initial value, then erase
+  // the PHI node itself.
+  OrigIndVar->replaceAllUsesWith(InitValue);
+  OrigIndVar->eraseFromParent();
+}
+
+bool EVLIndVarSimplifyImpl::run(Loop &L) {
+  if (!EnableEVLIndVarSimplify)
+    return false;
+
+  InductionDescriptor IVD;
+  PHINode *IndVar = L.getInductionVariable(SE);
+  if (!IndVar || !L.getInductionDescriptor(SE, IVD)) {
+    LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName()
+                      << "\n");
+    return false;
+  }
+
+  BasicBlock *InitBlock, *BackEdgeBlock;
+  if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) {
+    LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in "
+                      << L.getName() << "\n");
+    return false;
+  }
+
+  // Retrieve the loop bounds.
+  std::optional<Loop::LoopBounds> Bounds = L.getBounds(SE);
+  if (!Bounds) {
+    LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName()
+                      << "\n");
+    return false;
+  }
+  Value *CanonicalIVInit = &Bounds->getInitialIVValue();
+  Value *CanonicalIVFinal = &Bounds->getFinalIVValue();
+  const SCEV *CanonicalIVInitV = SE.getSCEV(CanonicalIVInit);
+  const SCEV *CanonicalIVFinalV = SE.getSCEV(CanonicalIVFinal);
+
+  const SCEV *StepV = IVD.getStep();
+  uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent());
+  if (!VF) {
+    LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV
+                      << "'\n");
+    return false;
+  }
+  LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName()
+                    << "\n");
+
+  // Try to find the EVL-based induction variable.
+  using namespace PatternMatch;
+  BasicBlock *BB = IndVar->getParent();
+
+  Value *EVLIndVar = nullptr;
+  Value *RemTC = nullptr, *TC = nullptr;
+  auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>(
+      m_Value(RemTC), m_SpecificInt(VF),
+      /*Scalable=*/m_SpecificInt(1));
+  for (auto &PN : BB->phis()) {
+    if (&PN == IndVar)
+      continue;
+
+    // Check 1: it has to contain both incoming (init) & backedge blocks
+    // from IndVar.
+    if (PN.getBasicBlockIndex(InitBlock) < 0 ||
+        PN.getBasicBlockIndex(BackEdgeBlock) < 0)
+      continue;
+    // Check 2: EVL index is always increasing, thus its initial value has to
+    // be equal to either the initial IV value (when the canonical IV is also
+    // increasing) or the last IV value (when canonical IV is decreasing).
+    Value *Init = PN.getIncomingValueForBlock(InitBlock);
+    using Direction = Loop::LoopBounds::Direction;
+    switch (Bounds->getDirection()) {
+    case Direction::Increasing:
+      if (Init != CanonicalIVInit)
+        continue;
+      break;
+    case Direction::Decreasing:
+      if (Init != CanonicalIVFinal)
+        continue;
+      break;
+    case Direction::Unknown:
+      // To be more permissive and see if either the initial or final IV value
+      // matches PN's init value.
+      if (Init != CanonicalIVInit && Init != CanonicalIVFinal)
+        continue;
+      break;
+    }
+    Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock);
+    assert(RecValue);
+
+    LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN
+                      << "\n");
+
+    // Check 3: Pattern match to find the EVL-based index and total trip count
+    // (TC).
+    if (match(RecValue,
+              m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) &&
+        match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) {
+      EVLIndVar = RecValue;
+      break;
+    }
+  }
+
+  if (!EVLIndVar || !TC)
+    return false;
+
+  // Make sure TC is related to the original trip count of the canonical IV.
+  // Specifically, if the canonical trip count is derived from TC.
+  const SCEV *TCV = SE.getSCEV(TC);
+  bool MatchTC = false;
+  if (const auto *ConstTCV = dyn_cast<SCEVConstant>(TCV)) {
+    // If TC is a constant and vscale is also a constant, then the canonical
+    // trip count will be constant. Canonical trip count * Step equals to the
+    // round up of TC.
+    if (const auto *ConstStep = dyn_cast<SCEVConstant>(StepV))
+      if (unsigned CanonicalTC = SE.getSmallConstantTripCount(&L)) {
+        APInt Step = ConstStep->getAPInt().abs().zextOrTrunc(64);
+        APInt CanonicalTripCount(64, CanonicalTC);
+        APInt TripCount = ConstTCV->getAPInt().zextOrTrunc(64);
+        MatchTC = (CanonicalTripCount * Step - TripCount).ult(Step);
+      }
+  }
+  // Otherwise, we simply check if the upper or lower bound expression of the
+  // canonical IV contains TC.
+  auto equalsTC = [&](const SCEV *S) -> bool { return S == TCV; };
+  if (!MatchTC && !llvm::SCEVExprContains(CanonicalIVFinalV, equalsTC) &&
+      !llvm::SCEVExprContains(CanonicalIVInitV, equalsTC))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n");
+
+  // Create an EVL-based comparison and replace the branch to use it as
----------------
mshockwave wrote:

> See #94411
> 
> Though this is with two changes to your test:
> 
> * adding vscale_range to the function
> * making the IV nuw
> 
> I think both should apply to your original (i.e. your test is slightly over-reduced), but please confirm.

Thank you for the quick fix.
I gave it a try and found one minor and one major concern. The minor one is that `vscale_range` has to be set; otherwise BTC cannot be computed. This is because one of the preconditions for computing BTC is that the range of `(VF x vscale)` has to be non-zero, which depends on the range of vscale. Currently we use `[1,0)`, a.k.a. wrap-around starting from 1, to represent the range of vscale when `vscale_range` is not set -- but this is an _unsigned_ wrap-around. However, when computing the multiplication range of `(VF x vscale)`, this `[1,0)` is treated as a _signed_ wrap-around, leading to `(VF x vscale)`'s range being full_set. Therefore, when asked for the minimum unsigned value of `(VF x vscale)`'s range, it returns zero. I call this "minor" because I believe that in practice vscale_range will mostly be set. But it's still a little annoying to me that we cannot represent an explicit unsigned range like `[1, UINT_MAX]` with `ConstantRange`.

The major concern is that the instructions for computing the exit value, expanded from the SCEV expressions we crafted, are pretty verbose and long and, more importantly, contain division instructions. Let me start with the formula we have:
```
IV = the original canonical IV
EVL_TC = the trip count in the scalar loop
IV.Step = (VF x vscale)
EVLStep = umin(IV.Step, sub_nsw(EVL_TC, IV))
BTC = backedge taken count in the original canonical IV loop
ExitValue = IV.evaluateAtIteration(BTC) + EVLStep.evaluateAtIteration(BTC)
```
In your original formula EVLStep is `min(VF, sub_nsw(EVL_TC, IV))`, and I had a question about whether the first operand of `min` should be VF or `IV.Step`. To the best of my understanding, `VF` is the same VF we put in the scalable vector type we use, `(vscale x VF x ...)`, and also the VF used in computing the step value of IV: `(VF x vscale)`. So I think we should use IV.Step here. At the very least, if I use `VF` (as in my definition) here, the result will be pretty wrong.

Now, since only SCEVAddRec has `evaluateAtIteration`, we can't really do `EVLStep.evaluateAtIteration`. Instead, I rewrote `EVLStep.evaluateAtIteration` into:
```
umin(IV.Step, sub_nsw(EVL_TC, IV.evaluateAtIteration(BTC)))
```
This works because IV.Step and EVL_TC are both loop-invariant.
Using the `@simple` function in my test case, plus the same modifications you made in your comment above, as an example, this final `IV.evaluateAtIteration(BTC) + umin(IV.Step, sub_nsw(EVL_TC, IV.evaluateAtIteration(BTC)))` looks like:
```
((4 * vscale * ((-1 * vscale * (4 + (-4 * ((-1 + (4 * vscale)<nuw><nsw> + %N) /u (4 * vscale)<nuw><nsw>))<nsw>)<nsw>) /u (4 * vscale)<nuw><nsw>)) + (((-4 * vscale * ((-1 * vscale * (4 + (-4 * ((-1 + (4 * vscale)<nuw><nsw> + %N) /u (4 * vscale)<nuw><nsw>))<nsw>)<nsw>) /u (4 * vscale)<nuw><nsw>)) + %N) umin (4 * vscale)<nuw><nsw>))
```
And the divisions seen in the expression will be expanded into division instructions. I'm concerned about whether we should spend so many instructions to compute something that will almost certainly be `EVL_TC`.

I also ran the same algorithm on the `@fixed_iv_step_tc` function in my test case, which has a fixed IV.Step and a fixed trip count. The generated exit value is correct, which means my methodology is probably not terribly off.

I've also tried to use `sub_nsw(EVL_TC, IV.evaluateAtIteration(BTC))`, which is the tail in the last iteration, as a replacement for `umin(IV.Step, sub_nsw(EVL_TC, IV.evaluateAtIteration(BTC)))`. But the resulting exit value, `IV.evaluateAtIteration(BTC) + sub_nsw(EVL_TC, IV.evaluateAtIteration(BTC))`, is not really meaningful (it will certainly be `EVL_TC` though).

Last but not least, I'm still a little confused about how to do Step (1) you described: it's true that I already did such a check -- but only in cases where both the trip count and vscale are constant. For all other cases we only have SCEV expressions, whose values we cannot know at compile time. Even if we expand the check into a runtime check, what should we do if the check fails at runtime? Do we fall back to the original canonical IV?

https://github.com/llvm/llvm-project/pull/91796