[llvm] [LV] Autovectorization for the all-in-one histogram intrinsic (PR #91458)

via llvm-commits llvm-commits at lists.llvm.org
Wed May 8 03:51:08 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-ir

Author: Graham Hunter (huntergr-arm)

<details>
<summary>Changes</summary>

This patch implements autovectorization support for the 'all-in-one' histogram intrinsic, which seems to have more support than the 'standalone' intrinsic. See https://discourse.llvm.org/t/rfc-vectorization-support-for-histogram-count-operations/74788/ for an overview of the work and my notes on the tradeoffs between the two approaches.

This patch currently contains parts of https://github.com/llvm/llvm-project/pull/88106 in order to demonstrate it working, but once the patch lands this will be rebased on top of that.

A proof-of-concept for autovec for the 'standalone' intrinsic can be found here to compare the differences for LoopVectorize: https://github.com/llvm/llvm-project/pull/89366


---

Patch is 51.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/91458.diff


27 Files Affected:

- (modified) llvm/include/llvm/Analysis/LoopAccessAnalysis.h (+24-2) 
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+8) 
- (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+4) 
- (modified) llvm/include/llvm/CodeGen/BasicTTIImpl.h (+4) 
- (modified) llvm/include/llvm/CodeGen/ISDOpcodes.h (+5) 
- (modified) llvm/include/llvm/CodeGen/SelectionDAG.h (+3) 
- (modified) llvm/include/llvm/CodeGen/SelectionDAGNodes.h (+31) 
- (modified) llvm/include/llvm/IR/Intrinsics.td (+7) 
- (modified) llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h (+17) 
- (modified) llvm/lib/Analysis/LoopAccessAnalysis.cpp (+134-8) 
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+4) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+38) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (+63) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h (+1) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp (+3) 
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+62) 
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+1) 
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+30) 
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h (+2) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (+9) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+50-1) 
- (modified) llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h (+6) 
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+25) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+32) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (+1) 
- (added) llvm/test/CodeGen/AArch64/sve2-histcnt.ll (+37) 
- (added) llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll (+82) 


``````````diff
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index cf998e66ee486..e9b47640f6c26 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -198,7 +198,8 @@ class MemoryDepChecker {
   bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
                    const DenseMap<Value *, const SCEV *> &Strides,
                    const DenseMap<Value *, SmallVector<const Value *, 16>>
-                       &UnderlyingObjects);
+                       &UnderlyingObjects,
+                   const SmallPtrSetImpl<const Value *> &HistogramPtrs);
 
   /// No memory dependence was encountered that would inhibit
   /// vectorization.
@@ -330,7 +331,8 @@ class MemoryDepChecker {
   isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
               unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
               const DenseMap<Value *, SmallVector<const Value *, 16>>
-                  &UnderlyingObjects);
+                  &UnderlyingObjects,
+              const SmallPtrSetImpl<const Value *> &HistogramPtrs);
 
   /// Check whether the data dependence could prevent store-load
   /// forwarding.
@@ -394,6 +396,15 @@ struct PointerDiffInfo {
         NeedsFreeze(NeedsFreeze) {}
 };
 
+struct HistogramInfo {
+  Instruction *Load;
+  Instruction *Update;
+  Instruction *Store;
+
+  HistogramInfo(Instruction *Load, Instruction *Update, Instruction *Store)
+      : Load(Load), Update(Update), Store(Store) {}
+};
+
 /// Holds information about the memory runtime legality checks to verify
 /// that a group of pointers do not overlap.
 class RuntimePointerChecking {
@@ -612,6 +623,10 @@ class LoopAccessInfo {
   unsigned getNumStores() const { return NumStores; }
   unsigned getNumLoads() const { return NumLoads;}
 
+  const SmallVectorImpl<HistogramInfo> &getHistograms() const {
+    return Histograms;
+  }
+
   /// The diagnostics report generated for the analysis.  E.g. why we
   /// couldn't analyze the loop.
   const OptimizationRemarkAnalysis *getReport() const { return Report.get(); }
@@ -724,6 +739,13 @@ class LoopAccessInfo {
   /// If an access has a symbolic strides, this maps the pointer value to
   /// the stride symbol.
   DenseMap<Value *, const SCEV *> SymbolicStrides;
+
+  /// Holds the load, update, and store instructions for all histogram-style
+  /// operations found in the loop.
+  SmallVector<HistogramInfo, 2> Histograms;
+
+  /// Storing Histogram Pointers
+  SmallPtrSet<const Value *, 2> HistogramPtrs;
 };
 
 /// Return the SCEV corresponding to a pointer with the symbolic stride
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1c76821fe5e4a..d104bca64ce39 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -982,6 +982,9 @@ class TargetTransformInfo {
   /// Return hardware support for population count.
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
 
+  /// Returns the cost of generating a vector histogram.
+  InstructionCost getHistogramCost(Type *Ty) const;
+
   /// Return true if the hardware has a fast square-root instruction.
   bool haveFastSqrt(Type *Ty) const;
 
@@ -1930,6 +1933,7 @@ class TargetTransformInfo::Concept {
                                               unsigned *Fast) = 0;
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
   virtual bool haveFastSqrt(Type *Ty) = 0;
+  virtual InstructionCost getHistogramCost(Type *Ty) = 0;
   virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) = 0;
   virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
   virtual InstructionCost getFPOpCost(Type *Ty) = 0;
@@ -2490,6 +2494,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   }
   bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
 
+  InstructionCost getHistogramCost(Type *Ty) override {
+    return Impl.getHistogramCost(Ty);
+  }
+
   bool isExpensiveToSpeculativelyExecute(const Instruction* I) override {
     return Impl.isExpensiveToSpeculativelyExecute(I);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4d5cd963e0926..43fc2de02c85b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -412,6 +412,10 @@ class TargetTransformInfoImplBase {
 
   bool haveFastSqrt(Type *Ty) const { return false; }
 
+  InstructionCost getHistogramCost(Type *Ty) const {
+    return InstructionCost::getInvalid();
+  }
+
   bool isExpensiveToSpeculativelyExecute(const Instruction *I) { return true; }
 
   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return true; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c6e90e57e46ed..52f80d58a5498 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -538,6 +538,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
   }
 
+  InstructionCost getHistogramCost(Type *Ty) {
+    return InstructionCost::getInvalid();
+  }
+
   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
     return true;
   }
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 6429947958ee9..c8aa9a81c6c0f 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1402,6 +1402,11 @@ enum NodeType {
   // which is later translated to an implicit use in the MIR.
   CONVERGENCECTRL_GLUE,
 
+  // Experimental vector histogram intrinsic
+  // Operands: Input Chain, Inc, Mask, Base, Index, Scale, ID
+  // Output: Output Chain
+  EXPERIMENTAL_HISTOGRAM,
+
   /// BUILTIN_OP_END - This must be the last enum value in this list.
   /// The target-specific pre-isel opcode values start here.
   BUILTIN_OP_END
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 4b1b58d4af0bb..71bf76c93e478 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1526,6 +1526,9 @@ class SelectionDAG {
                            ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
                            ISD::MemIndexType IndexType,
                            bool IsTruncating = false);
+  SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl,
+                             ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
+                             ISD::MemIndexType IndexType);
 
   SDValue getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT,
                       MachineMemOperand *MMO);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index e7c7104145455..ce1c012470dc4 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -542,6 +542,7 @@ BEGIN_TWO_BYTE_PACK()
     friend class MaskedLoadStoreSDNode;
     friend class MaskedGatherScatterSDNode;
     friend class VPGatherScatterSDNode;
+    friend class MaskedHistogramSDNode;
 
     uint16_t : NumMemSDNodeBits;
 
@@ -564,6 +565,7 @@ BEGIN_TWO_BYTE_PACK()
     friend class MaskedLoadSDNode;
     friend class MaskedGatherSDNode;
     friend class VPGatherSDNode;
+    friend class MaskedHistogramSDNode;
 
     uint16_t : NumLSBaseSDNodeBits;
 
@@ -1420,6 +1422,7 @@ class MemSDNode : public SDNode {
       return getOperand(2);
     case ISD::MGATHER:
     case ISD::MSCATTER:
+    case ISD::EXPERIMENTAL_HISTOGRAM:
       return getOperand(3);
     default:
       return getOperand(1);
@@ -1468,6 +1471,7 @@ class MemSDNode : public SDNode {
     case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
     case ISD::GET_FPENV_MEM:
     case ISD::SET_FPENV_MEM:
+    case ISD::EXPERIMENTAL_HISTOGRAM:
       return true;
     default:
       return N->isMemIntrinsic() || N->isTargetMemoryOpcode();
@@ -2953,6 +2957,33 @@ class MaskedScatterSDNode : public MaskedGatherScatterSDNode {
   }
 };
 
+class MaskedHistogramSDNode : public MemSDNode {
+public:
+  friend class SelectionDAG;
+
+  MaskedHistogramSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
+                        EVT MemVT, MachineMemOperand *MMO,
+                        ISD::MemIndexType IndexType)
+      : MemSDNode(ISD::EXPERIMENTAL_HISTOGRAM, Order, DL, VTs, MemVT, MMO) {
+    LSBaseSDNodeBits.AddressingMode = IndexType;
+  }
+
+  ISD::MemIndexType getIndexType() const {
+    return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
+  }
+
+  const SDValue &getBasePtr() const { return getOperand(3); }
+  const SDValue &getIndex() const { return getOperand(4); }
+  const SDValue &getMask() const { return getOperand(2); }
+  const SDValue &getScale() const { return getOperand(5); }
+  const SDValue &getInc() const { return getOperand(1); }
+  const SDValue &getIntID() const { return getOperand(6); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::EXPERIMENTAL_HISTOGRAM;
+  }
+};
+
 class FPStateAccessSDNode : public MemSDNode {
 public:
   friend class SelectionDAG;
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 29143123193b9..138703bf3d98f 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1856,6 +1856,13 @@ def int_experimental_vp_strided_load  : DefaultAttrsIntrinsic<[llvm_anyvector_ty
                                llvm_i32_ty],
                              [ NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>;
 
+// Experimental histogram
+def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
+                             [ llvm_anyvector_ty, // Vector of pointers
+                               llvm_anyint_ty,    // Increment
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+                             []>;
+
 // Operators
 let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
   // Integer arithmetic
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index a509ebf6a7e1b..3ac73232c0f7b 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -387,6 +387,23 @@ class LoopVectorizationLegality {
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
 
+  bool isHistogramLoadOrUpdate(Instruction *I) const {
+    for (const HistogramInfo &HGram : LAI->getHistograms())
+      if (HGram.Load == I || HGram.Update == I)
+        return true;
+
+    return false;
+  }
+
+  std::optional<const HistogramInfo *>
+  getHistogramForStore(StoreInst *SI) const {
+    for (const HistogramInfo &HGram : LAI->getHistograms())
+      if (HGram.Store == SI)
+        return &HGram;
+
+    return std::nullopt;
+  }
+
   PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
     return &PSE;
   }
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index ae7f0373c4e84..2b3a44fdc7d23 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
@@ -69,6 +70,8 @@ using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "loop-accesses"
 
+STATISTIC(HistogramsDetected, "Number of Histograms detected");
+
 static cl::opt<unsigned, true>
 VectorizationFactor("force-vector-width", cl::Hidden,
                     cl::desc("Sets the SIMD width. Zero is autoselect."),
@@ -730,6 +733,23 @@ class AccessAnalysis {
     return UnderlyingObjects;
   }
 
+  /// Find Histogram counts that match high-level code in loops:
+  /// \code
+  /// buckets[indices[i]]+=step;
+  /// \endcode
+  ///
+  /// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
+  /// array the computed histogram. It uses a BinOp to sum all counts, storing
+  /// them using a loop-variant index Load from the 'indices' input array.
+  ///
+  /// On successful matches it updates the STATISTIC 'HistogramsDetected',
+  /// regardless of hardware support. When there is support, it additionally
+  /// stores the BinOp/Load pairs in \p HistogramCounts, as well the pointers
+  /// used to update histogram in \p HistogramPtrs.
+  void findHistograms(StoreInst *HSt,
+                      SmallVectorImpl<HistogramInfo> &Histograms,
+                      SmallPtrSetImpl<const Value *> &HistogramPtrs);
+
 private:
   typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
 
@@ -1947,7 +1967,8 @@ getDependenceDistanceStrideAndSize(
     const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
     const DenseMap<Value *, const SCEV *> &Strides,
     const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
-    PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
+    PredicatedScalarEvolution &PSE, const Loop *InnermostLoop,
+    const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
   auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
   auto &SE = *PSE.getSE();
   auto [APtr, AIsWrite] = A;
@@ -1965,6 +1986,15 @@ getDependenceDistanceStrideAndSize(
       BPtr->getType()->getPointerAddressSpace())
     return MemoryDepChecker::Dependence::Unknown;
 
+  // Ignore Histogram count updates as they are handled by the Intrinsic. This
+  // happens when the same pointer is first used to read from and then is used
+  // to write to.
+  if (!AIsWrite && BIsWrite && APtr == BPtr && HistogramPtrs.contains(APtr)) {
+    LLVM_DEBUG(dbgs() << "LAA: Histogram: Update is safely ignored. Pointer: "
+                      << *APtr);
+    return MemoryDepChecker::Dependence::NoDep;
+  }
+
   int64_t StrideAPtr =
       getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
   int64_t StrideBPtr =
@@ -2018,15 +2048,15 @@ getDependenceDistanceStrideAndSize(
 MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
     const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
     unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
-    const DenseMap<Value *, SmallVector<const Value *, 16>>
-        &UnderlyingObjects) {
+    const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
+    const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
   assert(AIdx < BIdx && "Must pass arguments in program order");
 
   // Get the dependence distance, stride, type size and what access writes for
   // the dependence between A and B.
   auto Res = getDependenceDistanceStrideAndSize(
       A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE,
-      InnermostLoop);
+      InnermostLoop, HistogramPtrs);
   if (std::holds_alternative<Dependence::DepType>(Res))
     return std::get<Dependence::DepType>(Res);
 
@@ -2240,8 +2270,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
 bool MemoryDepChecker::areDepsSafe(
     DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
     const DenseMap<Value *, const SCEV *> &Strides,
-    const DenseMap<Value *, SmallVector<const Value *, 16>>
-        &UnderlyingObjects) {
+    const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
+    const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
 
   MinDepDistBytes = -1;
   SmallPtrSet<MemAccessInfo, 8> Visited;
@@ -2286,7 +2316,7 @@ bool MemoryDepChecker::areDepsSafe(
 
             Dependence::DepType Type =
                 isDependent(*A.first, A.second, *B.first, B.second, Strides,
-                            UnderlyingObjects);
+                            UnderlyingObjects, HistogramPtrs);
             mergeInStatus(Dependence::isSafeForVectorization(Type));
 
             // Gather dependences unless we accumulated MaxDependences
@@ -2622,6 +2652,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
   // check.
   Accesses.buildDependenceSets();
 
+  for (StoreInst *ST : Stores)
+    Accesses.findHistograms(ST, Histograms, HistogramPtrs);
+
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
   Value *UncomputablePtr = nullptr;
@@ -2646,7 +2679,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
     LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
     CanVecMem = DepChecker->areDepsSafe(
         DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
-        Accesses.getUnderlyingObjects());
+        Accesses.getUnderlyingObjects(), HistogramPtrs);
 
     if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
       LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
@@ -3084,6 +3117,99 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
   return *I.first->second;
 }
 
+void AccessAnalysis::findHistograms(
+    StoreInst *HSt, SmallVectorImpl<HistogramInfo> &Histograms,
+    SmallPtrSetImpl<const Value *> &HistogramPtrs) {
+  LLVM_DEBUG(dbgs() << "LAA: Attempting to match histogram from " << *HSt
+                    << "\n");
+  // Store value must come from a Binary Operation.
+  Instruction *HPtrInstr = nullptr;
+  BinaryOperator *HBinOp = nullptr;
+  if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr)))) {
+    LLVM_DEBUG(dbgs() << "\tNo BinOp\n");
+    return;
+  }
+
+  // BinOp must be an Add or a Sub operating modifying the bucket value by a
+  // loop invariant amount.
+  // FIXME: We assume the loop invariant term is on the RHS.
+  //        Fine for an immediate/constant, but maybe not a generic value?
+  Value *HIncVal = nullptr;
+  if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+      !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal)))) {
+    LLVM_DEBUG(dbgs() << "\tNo matching load\n");
+    return;
+  }
+  Instruction *IndexedLoad = cast<Instruction>(HBinOp->getOperand(0));
+
+  // The address to store is calculated through a GEP Instruction.
+  // FIXME: Support GEPs with more operands.
+  GetElementPtrInst *HPtr = dyn_cast<GetElementPtrInst>(HPtrInstr);
+  if (!HPtr || HPtr->getNumOperands() > 2) {
+    LLVM_DEBUG(dbgs() << "\tToo many GEP operands\n");
+    return;
+  }
+
+  // Check that the index is calculated by loading from another array. Ignore
+  // any extensions.
+  // FIXME: Support indices from other sources that a linear load from memory?
+  Value *HIdx = HPtr->getOperand(1);
+  Instruction *IdxInst = nullptr;
+  // FIXME: Can this fail? Maybe if IdxInst isn't an instruction. Just need to
+  //        look through extensions, find another way?
+  if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Instruction(IdxInst))))
+    return;
+
+  // Currently restricting this to linear addressing when loading indices.
+  LoadInst *VLoad = dyn_cast<LoadInst>(IdxInst);
+  Value *VPtrVal;
+  if (!VLoad || !match(VLoad, m_Load(m_Value(VPtrVal)))) {
+    LLVM_DEBUG(dbgs() << "\tBad Index Load\n");
+    return;
+  }
+
+  if (!isa<SCEVAddRecExpr>(PSE.getSCEV(VPtrVal))) {
+    LLVM_DEBUG(dbgs() << "\tCannot determine index load stride\n");
+    return;
+  }
+
+  // FIXME: support smaller types of input arrays. Integers can be promoted
+  //        for codegen.
+  Type *VLoadTy = VLoad->getType();
+  if (!VLoadTy->isIntegerTy() || (VLoadTy->getScalarSizeInBits() != 32 &&
+                                  VLoadTy->getScalarSizeInBits() != 64)) {
+    LLVM_DEBUG(dbgs() << "\tUnsupported bucket type: " << *VLoadTy << "\n");
+    return;
+  }
+
+  // Ensure we'll have the same mask by checking that all parts of the histogram
+  // are in the same block.
+  // FIXME: Could use dominance checks instead?
+  if (IndexedLoad->getParent() != HBi...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/91458


More information about the llvm-commits mailing list