[llvm] [VPlan] Move auxiliary declarations out of VPlan.h (NFC). (PR #124104)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 23 12:50:52 PST 2025


================
@@ -0,0 +1,471 @@
+//===- VPlanHelpers.h - VPlan-related auxiliary helpers -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains the declarations of different VPlan-related auxiliary
+/// helpers.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H
+
+#include "VPlanAnalysis.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/InstructionCost.h"
+
+namespace llvm {
+
+class BasicBlock;
+class DominatorTree;
+class InnerLoopVectorizer;
+class IRBuilderBase;
+class LoopInfo;
+class SCEV;
+class Type;
+class VPBasicBlock;
+class VPRegionBlock;
+class VPlan;
+class Value;
+class LoopVersioning;
+
+/// Returns a calculation for the total number of elements for a given \p VF.
+/// For fixed width vectors this value is a constant, whereas for scalable
+/// vectors it is an expression determined at runtime.
+Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
+
+/// Return a value for Step multiplied by VF.
+Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
+                       int64_t Step);
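// Editor's sketch (not part of the patch): the arithmetic these two helpers
// materialize as IR, modelled with plain integers. 'VScaleAtRuntime' is a
// hypothetical stand-in for the runtime value of llvm.vscale.
unsigned long long runtimeVF(unsigned KnownMinVF, bool Scalable,
                             unsigned VScaleAtRuntime) {
  // Fixed-width VF: a compile-time constant. Scalable VF: vscale * KnownMinVF,
  // only known at run time.
  return Scalable ? (unsigned long long)VScaleAtRuntime * KnownMinVF
                  : KnownMinVF;
}
long long stepForVF(unsigned KnownMinVF, bool Scalable,
                    unsigned VScaleAtRuntime, long long Step) {
  // createStepForVF conceptually produces Step multiplied by the runtime VF.
  return Step * (long long)runtimeVF(KnownMinVF, Scalable, VScaleAtRuntime);
}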
+
+/// A helper function that returns the reciprocal of the block probability of
+/// predicated blocks. If we return X, we are assuming the predicated block
+/// will execute once for every X iterations of the loop header.
+///
+/// TODO: We should use actual block probability here, if available. Currently,
+///       we always assume predicated blocks have a 50% chance of executing.
+inline unsigned getReciprocalPredBlockProb() { return 2; }
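// Editor's sketch (not part of the patch): how the reciprocal probability is
// typically applied; 'BlockCost' is an illustrative name, not an LLVM API.
unsigned discountPredicatedBlockCost(unsigned BlockCost) {
  // Assuming the block runs once every 2 header iterations, halve its cost.
  return BlockCost / 2; // 2 == getReciprocalPredBlockProb()
}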
+
+/// A range of powers-of-2 vectorization factors with fixed start and
+/// adjustable end. The range includes start and excludes end, e.g.,:
+/// [1, 16) = {1, 2, 4, 8}
+struct VFRange {
+  // A power of 2.
+  const ElementCount Start;
+
+  // A power of 2. If End <= Start, the range is empty.
+  ElementCount End;
+
+  bool isEmpty() const {
+    return End.getKnownMinValue() <= Start.getKnownMinValue();
+  }
+
+  VFRange(const ElementCount &Start, const ElementCount &End)
+      : Start(Start), End(End) {
+    assert(Start.isScalable() == End.isScalable() &&
+           "Both Start and End should have the same scalable flag");
+    assert(isPowerOf2_32(Start.getKnownMinValue()) &&
+           "Expected Start to be a power of 2");
+    assert(isPowerOf2_32(End.getKnownMinValue()) &&
+           "Expected End to be a power of 2");
+  }
+
+  /// Iterator to iterate over vectorization factors in a VFRange.
+  class iterator
+      : public iterator_facade_base<iterator, std::forward_iterator_tag,
+                                    ElementCount> {
+    ElementCount VF;
+
+  public:
+    iterator(ElementCount VF) : VF(VF) {}
+
+    bool operator==(const iterator &Other) const { return VF == Other.VF; }
+
+    ElementCount operator*() const { return VF; }
+
+    iterator &operator++() {
+      VF *= 2;
+      return *this;
+    }
+  };
+
+  iterator begin() { return iterator(Start); }
+  iterator end() {
+    assert(isPowerOf2_32(End.getKnownMinValue()));
+    return iterator(End);
+  }
+};
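// Editor's sketch (not part of the patch): the VFs a VFRange visits, modelled
// with plain unsigned values. [1, 16) yields {1, 2, 4, 8}; a range with
// End <= Start, e.g. [8, 8), yields nothing.
void forEachVF(unsigned Start, unsigned End /* both powers of 2 */) {
  for (unsigned VF = Start; VF < End; VF *= 2) {
    (void)VF; // visit VF, doubling exactly like VPlan's VFRange::iterator
  }
}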
+
+/// In what follows, the term "input IR" refers to code that is fed into the
+/// vectorizer whereas the term "output IR" refers to code that is generated by
+/// the vectorizer.
+
+/// VPLane provides a way to access lanes in both fixed width and scalable
+/// vectors, where for the latter the lane index sometimes needs calculating
+/// as a runtime expression.
+class VPLane {
+public:
+  /// Kind describes how to interpret Lane.
+  enum class Kind : uint8_t {
+    /// For First, Lane is the index into the first N elements of a
+    /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
+    First,
+    /// For ScalableLast, Lane is the offset from the start of the last
+    /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
+    /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
+    /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
+    ScalableLast
+  };
+
+private:
+  /// in [0..VF)
+  unsigned Lane;
+
+  /// Indicates how the Lane should be interpreted, as described above.
+  Kind LaneKind;
+
+public:
+  VPLane(unsigned Lane) : Lane(Lane), LaneKind(VPLane::Kind::First) {}
+  VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
+
+  static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
+
+  static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
+    assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
+           "trying to extract with invalid offset");
+    unsigned LaneOffset = VF.getKnownMinValue() - Offset;
+    Kind LaneKind;
+    if (VF.isScalable())
+      // In this case 'LaneOffset' refers to the offset from the start of the
+      // last subvector with VF.getKnownMinValue() elements.
+      LaneKind = VPLane::Kind::ScalableLast;
+    else
+      LaneKind = VPLane::Kind::First;
+    return VPLane(LaneOffset, LaneKind);
+  }
+
+  static VPLane getLastLaneForVF(const ElementCount &VF) {
+    return getLaneFromEnd(VF, 1);
+  }
+
+  /// Returns a compile-time known value for the lane index and asserts if the
+  /// lane can only be calculated at runtime.
+  unsigned getKnownLane() const {
+    assert(LaneKind == Kind::First);
+    return Lane;
+  }
+
+  /// Returns an expression describing the lane index that can be used at
+  /// runtime.
+  Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;
+
+  /// Returns the Kind of lane offset.
+  Kind getKind() const { return LaneKind; }
+
+  /// Returns true if this is the first lane of the whole vector.
+  bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
+
+  /// Maps the lane to a cache index based on \p VF.
+  unsigned mapToCacheIndex(const ElementCount &VF) const {
+    switch (LaneKind) {
+    case VPLane::Kind::ScalableLast:
+      assert(VF.isScalable() && Lane < VF.getKnownMinValue());
+      return VF.getKnownMinValue() + Lane;
+    default:
+      assert(Lane < VF.getKnownMinValue());
+      return Lane;
+    }
+  }
+
+  /// Returns the maximum number of lanes that we are able to consider
+  /// caching for \p VF.
+  static unsigned getNumCachedLanes(const ElementCount &VF) {
+    return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
+  }
+};
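// Editor's sketch (not part of the patch): mapToCacheIndex in plain integers.
// First-kind lanes occupy [0, MinVF); ScalableLast lanes occupy a second block
// [MinVF, 2 * MinVF), which is why getNumCachedLanes returns MinVF * 2 for
// scalable vectors.
unsigned cacheIndex(unsigned Lane, bool IsScalableLast, unsigned KnownMinVF) {
  return IsScalableLast ? KnownMinVF + Lane : Lane;
}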
+
+/// VPTransformState holds information passed down when "executing" a VPlan,
+/// needed for generating the output IR.
+struct VPTransformState {
+  VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF,
+                   LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
+                   InnerLoopVectorizer *ILV, VPlan *Plan,
+                   Loop *CurrentParentLoop, Type *CanonicalIVTy);
+  /// Target Transform Info.
+  const TargetTransformInfo *TTI;
+
+  /// The chosen Vectorization Factor of the loop being vectorized.
+  ElementCount VF;
+
+  /// Hold the index to generate specific scalar instructions. Null indicates
+  /// that all instances are to be generated, using either scalar or vector
+  /// instructions.
+  std::optional<VPLane> Lane;
+
+  struct DataState {
+    // Each value from the original loop, when vectorized, is represented by a
+    // vector value in the map.
+    DenseMap<VPValue *, Value *> VPV2Vector;
+
+    DenseMap<VPValue *, SmallVector<Value *, 4>> VPV2Scalars;
+  } Data;
+
+  /// Get the generated vector Value for a given VPValue \p Def if \p IsScalar
+  /// is false, otherwise return the generated scalar. \see set.
+  Value *get(VPValue *Def, bool IsScalar = false);
+
+  /// Get the generated Value for a given VPValue and a given Lane.
+  Value *get(VPValue *Def, const VPLane &Lane);
+
+  bool hasVectorValue(VPValue *Def) { return Data.VPV2Vector.contains(Def); }
+
+  bool hasScalarValue(VPValue *Def, VPLane Lane) {
+    auto I = Data.VPV2Scalars.find(Def);
+    if (I == Data.VPV2Scalars.end())
+      return false;
+    unsigned CacheIdx = Lane.mapToCacheIndex(VF);
+    return CacheIdx < I->second.size() && I->second[CacheIdx];
+  }
+
+  /// Set the generated vector Value for a given VPValue, if \p
+  /// IsScalar is false. If \p IsScalar is true, set the scalar in lane 0.
+  void set(VPValue *Def, Value *V, bool IsScalar = false) {
+    if (IsScalar) {
+      set(Def, V, VPLane(0));
+      return;
+    }
+    assert((VF.isScalar() || V->getType()->isVectorTy()) &&
+           "scalar values must be stored as (0, 0)");
+    Data.VPV2Vector[Def] = V;
+  }
+
+  /// Reset an existing vector value for \p Def.
+  void reset(VPValue *Def, Value *V) {
+    assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value");
+    Data.VPV2Vector[Def] = V;
+  }
+
+  /// Set the generated scalar \p V for \p Def and the given \p Lane.
+  void set(VPValue *Def, Value *V, const VPLane &Lane) {
+    auto &Scalars = Data.VPV2Scalars[Def];
+    unsigned CacheIdx = Lane.mapToCacheIndex(VF);
+    if (Scalars.size() <= CacheIdx)
+      Scalars.resize(CacheIdx + 1);
+    assert(!Scalars[CacheIdx] && "should overwrite existing value");
+    Scalars[CacheIdx] = V;
+  }
+
+  /// Reset an existing scalar value for \p Def and a given \p Lane.
+  void reset(VPValue *Def, Value *V, const VPLane &Lane) {
+    auto Iter = Data.VPV2Scalars.find(Def);
+    assert(Iter != Data.VPV2Scalars.end() &&
+           "need to overwrite existing value");
+    unsigned CacheIdx = Lane.mapToCacheIndex(VF);
+    assert(CacheIdx < Iter->second.size() &&
+           "need to overwrite existing value");
+    Iter->second[CacheIdx] = V;
+  }
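// Editor's sketch (not part of the patch): the per-VPValue scalar cache is a
// flat vector addressed via VPLane::mapToCacheIndex, so set(), reset() and
// hasScalarValue() all agree on the slot. Modelled here with ints instead of
// Value pointers; the names are illustrative only.
#include <vector>
void setScalarSlot(std::vector<int> &Scalars, unsigned CacheIdx, int V) {
  if (Scalars.size() <= CacheIdx)
    Scalars.resize(CacheIdx + 1); // set() grows the cache on demand;
  Scalars[CacheIdx] = V;          // reset() instead requires the slot to exist.
}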
+
+  /// Add additional metadata to \p To that was not present on \p Orig.
+  ///
+  /// Currently this is used to add the noalias annotations based on the
+  /// inserted memchecks.  Use this for instructions that are *cloned* into the
+  /// vector loop.
+  void addNewMetadata(Instruction *To, const Instruction *Orig);
+
+  /// Add metadata from one instruction to another.
+  ///
+  /// This includes both the original MDs from \p From and additional ones (\see
+  /// addNewMetadata).  Use this for *newly created* instructions in the vector
+  /// loop.
+  void addMetadata(Value *To, Instruction *From);
+
+  /// Set the debug location in the builder using the debug location \p DL.
+  void setDebugLocFrom(DebugLoc DL);
+
+  /// Construct the vector value of a scalarized value \p V one lane at a time.
+  void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane);
+
+  /// Hold state information used when constructing the CFG of the output IR,
+  /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
+  struct CFGState {
+    /// The previous VPBasicBlock visited. Initially set to null.
+    VPBasicBlock *PrevVPBB = nullptr;
+
+    /// The previous IR BasicBlock created or used. Initially set to the new
+    /// header BasicBlock.
+    BasicBlock *PrevBB = nullptr;
+
+    /// The last IR BasicBlock in the output IR. Set to the exit block of the
+    /// vector loop.
+    BasicBlock *ExitBB = nullptr;
+
+    /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
+    /// of replication, maps the BasicBlock of the last replica created.
+    SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
+
+    /// Updater for the DominatorTree.
+    DomTreeUpdater DTU;
+
+    CFGState(DominatorTree *DT)
+        : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}
+
+    /// Returns the BasicBlock* mapped to the pre-header of the loop region
+    /// containing \p R.
+    BasicBlock *getPreheaderBBFor(VPRecipeBase *R);
+  } CFG;
+
+  /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
+  LoopInfo *LI;
+
+  /// Hold a reference to the IRBuilder used to generate output IR code.
+  IRBuilderBase &Builder;
+
+  /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
+  InnerLoopVectorizer *ILV;
+
+  /// Pointer to the VPlan for which code is generated.
+  VPlan *Plan;
+
+  /// The parent loop object for the current scope, or nullptr.
+  Loop *CurrentParentLoop = nullptr;
+
+  /// LoopVersioning.  It's only set up (non-null) if memchecks were
+  /// used.
+  ///
+  /// This is currently only used to add no-alias metadata based on the
+  /// memchecks.  The actual versioning is performed manually.
+  LoopVersioning *LVer = nullptr;
----------------
fhahn wrote:

At the moment it is only set if memory checks are needed, hence the pointer. Others like `VPlan`, `ILV` & co. could be references, but I'd prefer to update that separately and keep this PR mostly a plain move.

https://github.com/llvm/llvm-project/pull/124104

