[llvm] Expanding the Histogram Intrinsic (PR #127399)

via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 18 02:33:12 PST 2025


https://github.com/RonDahan101 updated https://github.com/llvm/llvm-project/pull/127399

>From dae234d8cf81966d42b967c6e944ba215da34e0d Mon Sep 17 00:00:00 2001
From: rond <ron.dahan at mobileye.com>
Date: Sun, 16 Feb 2025 15:40:30 +0200
Subject: [PATCH 1/3] Expanding the Histogram Intrinsic

Expanding the Histogram intrinsic to support more update options,
uadd.sat, umax, umin.
---
 llvm/docs/LangRef.rst                         |   3 +
 llvm/include/llvm/IR/Intrinsics.td            |  18 ++
 .../Vectorize/LoopVectorizationLegality.cpp   |  31 +--
 .../Transforms/Vectorize/LoopVectorize.cpp    |   6 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   9 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  85 +++++++-
 .../LoopVectorize/AArch64/sve2-histcnt.ll     | 199 ++++++++++++++++++
 7 files changed, 327 insertions(+), 24 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index deb87365ae8d7..59496ebb93cd7 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20295,6 +20295,9 @@ More update operation types may be added in the future.
 
     declare void @llvm.experimental.vector.histogram.add.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask)
     declare void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %ptrs, i64 %inc, <vscale x 2 x i1> %mask)
+    declare void @llvm.experimental.vector.histogram.uadd.sat.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask)
+    declare void @llvm.experimental.vector.histogram.umax.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask)
+    declare void @llvm.experimental.vector.histogram.umin.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask)
 
 Arguments:
 """"""""""
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 14ecae41ff08f..31a0ba2e6500d 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1947,6 +1947,24 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
                              [ IntrArgMemOnly ]>;
 
+def int_experimental_vector_histogram_uadd_sat : DefaultAttrsIntrinsic<[],
+                             [ llvm_anyvector_ty, // Vector of pointers
+                               llvm_anyint_ty,    // Increment
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+                             [ IntrArgMemOnly ]>;
+
+def int_experimental_vector_histogram_umin : DefaultAttrsIntrinsic<[],
+                             [ llvm_anyvector_ty, // Vector of pointers
+                               llvm_anyint_ty,    // Update value
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+                             [ IntrArgMemOnly ]>;
+
+def int_experimental_vector_histogram_umax : DefaultAttrsIntrinsic<[],
+                             [ llvm_anyvector_ty, // Vector of pointers
+                               llvm_anyint_ty,    // Update value
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+                             [ IntrArgMemOnly ]>;
+
 // Experimental match
 def int_experimental_vector_match : DefaultAttrsIntrinsic<
                              [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 420cbc5384ce4..8325168342330 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1072,25 +1072,26 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
 /// Find histogram operations that match high-level code in loops:
 /// \code
-/// buckets[indices[i]]+=step;
+/// buckets[indices[i]] = UpdateOperator(buckets[indices[i]], Val);
 /// \endcode
+/// Where UpdateOperator can be add, sub, uadd.sat, umin, umax.
 ///
 /// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
-/// array the computed histogram. It uses a BinOp to sum all counts, storing
-/// them using a loop-variant index Load from the 'indices' input array.
+/// array the computed histogram. It uses an update instruction to update all
+/// counts, storing them using a loop-variant index Load from the 'indices'
+/// input array.
 ///
 /// On successful matches it updates the STATISTIC 'HistogramsDetected',
 /// regardless of hardware support. When there is support, it additionally
-/// stores the BinOp/Load pairs in \p HistogramCounts, as well the pointers
+/// stores the UpdateOp/Load pairs in \p HistogramCounts, as well as the pointers
 /// used to update histogram in \p HistogramPtrs.
 static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
                           const PredicatedScalarEvolution &PSE,
                           SmallVectorImpl<HistogramInfo> &Histograms) {
 
-  // Store value must come from a Binary Operation.
   Instruction *HPtrInstr = nullptr;
-  BinaryOperator *HBinOp = nullptr;
-  if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr))))
+  Instruction *HInstr = nullptr;
+  if (!match(HSt, m_Store(m_Instruction(HInstr), m_Instruction(HPtrInstr))))
     return false;
 
   // BinOp must be an Add or a Sub modifying the bucket value by a
@@ -1098,8 +1099,14 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
   // FIXME: We assume the loop invariant term is on the RHS.
   //        Fine for an immediate/constant, but maybe not a generic value?
   Value *HIncVal = nullptr;
-  if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
-      !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))))
+  if (!match(HInstr, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+      !match(HInstr, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+      !match(HInstr, m_Intrinsic<Intrinsic::uadd_sat>(
+                         m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+      !match(HInstr, m_Intrinsic<Intrinsic::umax>(m_Load(m_Specific(HPtrInstr)),
+                                                  m_Value(HIncVal))) &&
+      !match(HInstr, m_Intrinsic<Intrinsic::umin>(m_Load(m_Specific(HPtrInstr)),
+                                                  m_Value(HIncVal))))
     return false;
 
   // Make sure the increment value is loop invariant.
@@ -1141,15 +1148,15 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
 
   // Ensure we'll have the same mask by checking that all parts of the histogram
   // (gather load, update, scatter store) are in the same block.
-  LoadInst *IndexedLoad = cast<LoadInst>(HBinOp->getOperand(0));
+  LoadInst *IndexedLoad = cast<LoadInst>(HInstr->getOperand(0));
   BasicBlock *LdBB = IndexedLoad->getParent();
-  if (LdBB != HBinOp->getParent() || LdBB != HSt->getParent())
+  if (LdBB != HInstr->getParent() || LdBB != HSt->getParent())
     return false;
 
   LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");
 
   // Store the operations that make up the histogram.
-  Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
+  Histograms.emplace_back(IndexedLoad, HInstr, HSt);
   return true;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8c41f896ad622..9aec1fe9b570e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8658,14 +8658,16 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
                                      ArrayRef<VPValue *> Operands) {
   // FIXME: Support other operations.
   unsigned Opcode = HI->Update->getOpcode();
-  assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
-         "Histogram update operation must be an Add or Sub");
+  assert(VPHistogramRecipe::isLegalUpdateInstruction(HI->Update) &&
+         "Found illegal update instruction for histogram");
 
   SmallVector<VPValue *, 3> HGramOps;
   // Bucket address.
   HGramOps.push_back(Operands[1]);
   // Increment value.
   HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
+  // Update Instruction.
+  HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update));
 
   // In case of predicated execution (due to tail-folding, or conditional
   // execution, or both), pass the relevant mask.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8089cfd1ce802..38c0731c4f956 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1466,9 +1466,16 @@ class VPHistogramRecipe : public VPRecipeBase {
   /// Return the mask operand if one was provided, or a null pointer if all
   /// lanes should be executed unconditionally.
   VPValue *getMask() const {
-    return getNumOperands() == 3 ? getOperand(2) : nullptr;
+    return getNumOperands() == 4 ? getOperand(3) : nullptr;
   }
 
+  /// Returns true if \p I is a legal update instruction for a histogram operation.
+  static bool isLegalUpdateInstruction(Instruction *I);
+
+  /// Given update instruction \p I, returns the opcode of the corresponding
+  /// histogram instruction.
+  static unsigned getHistogramOpcode(Instruction *I);
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe
   void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d57a6c481748c..c25de0e522277 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1223,6 +1223,7 @@ void VPHistogramRecipe::execute(VPTransformState &State) {
 
   Value *Address = State.get(getOperand(0));
   Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
+  Instruction *UpdateInst = cast<Instruction>(State.get(getOperand(2)));
   VectorType *VTy = cast<VectorType>(Address->getType());
 
   // The histogram intrinsic requires a mask even if the recipe doesn't;
@@ -1239,10 +1240,10 @@ void VPHistogramRecipe::execute(VPTransformState &State) {
   // add a separate intrinsic in future, but for now we'll try this.
   if (Opcode == Instruction::Sub)
     IncAmt = Builder.CreateNeg(IncAmt);
-  else
-    assert(Opcode == Instruction::Add && "only add or sub supported for now");
+  assert(isLegalUpdateInstruction(UpdateInst) &&
+         "Found illegal update instruction for histogram");
 
-  State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
+  State.Builder.CreateIntrinsic(getHistogramOpcode(UpdateInst),
                                 {VTy, IncAmt->getType()},
                                 {Address, IncAmt, Mask});
 }
@@ -1277,24 +1278,51 @@ InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
   IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                               Type::getVoidTy(Ctx.LLVMCtx),
                               {PtrTy, IncTy, MaskTy});
+  auto *UpdateInst = getOperand(2)->getUnderlyingValue();
+  InstructionCost UpdateCost;
+  if (isa<IntrinsicInst>(UpdateInst)) {
+    IntrinsicCostAttributes UpdateICA(Opcode, IncTy, {IncTy, IncTy});
+    UpdateCost = Ctx.TTI.getIntrinsicInstrCost(UpdateICA, Ctx.CostKind);
+  } else
+    UpdateCost = Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
 
   // Add the costs together with the add/sub operation.
   return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
-         Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
+         UpdateCost;
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
+  auto *UpdateInst = cast<Instruction>(getOperand(2)->getUnderlyingValue());
+  assert(isLegalUpdateInstruction(UpdateInst) &&
+         "Found illegal update instruction for histogram");
   O << Indent << "WIDEN-HISTOGRAM buckets: ";
   getOperand(0)->printAsOperand(O, SlotTracker);
 
-  if (Opcode == Instruction::Sub)
-    O << ", dec: ";
-  else {
-    assert(Opcode == Instruction::Add);
-    O << ", inc: ";
+  std::string UpdateMsg;
+  if (isa<BinaryOperator>(UpdateInst)) {
+    if (Opcode == Instruction::Sub)
+      UpdateMsg = ", dec: ";
+    else {
+      UpdateMsg = ", inc: ";
+    }
+  } else {
+    switch (cast<IntrinsicInst>(UpdateInst)->getIntrinsicID()) {
+    case Intrinsic::uadd_sat:
+      UpdateMsg = ", saturated inc: ";
+      break;
+    case Intrinsic::umax:
+      UpdateMsg = ", max: ";
+      break;
+    case Intrinsic::umin:
+      UpdateMsg = ", min: ";
+      break;
+    default:
+      llvm_unreachable("Found illegal update instruction for histogram");
+    }
   }
+  O << UpdateMsg;
   getOperand(1)->printAsOperand(O, SlotTracker);
 
   if (VPValue *Mask = getMask()) {
@@ -1303,6 +1331,45 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
   }
 }
 
+bool VPHistogramRecipe::isLegalUpdateInstruction(Instruction *I) {
+  // We only support add and sub instructions and the following list of
+  // intrinsics: uadd.sat, umax, umin.
+  if (isa<BinaryOperator>(I))
+    return I->getOpcode() == Instruction::Add ||
+           I->getOpcode() == Instruction::Sub;
+  if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::uadd_sat:
+    case Intrinsic::umax:
+    case Intrinsic::umin:
+      return true;
+    default:
+      return false;
+    }
+  }
+  return false;
+}
+
+unsigned VPHistogramRecipe::getHistogramOpcode(Instruction *I) {
+  // We only support add and sub instructions and the following list of
+  // intrinsics: uadd.sat, umax, umin.
+  assert(isLegalUpdateInstruction(I) &&
+         "Found illegal update instruction for histogram");
+  if (isa<BinaryOperator>(I))
+    return Intrinsic::experimental_vector_histogram_add;
+  auto *II = cast<IntrinsicInst>(I);
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::uadd_sat:
+    return Intrinsic::experimental_vector_histogram_uadd_sat;
+  case Intrinsic::umax:
+    return Intrinsic::experimental_vector_histogram_umax;
+  case Intrinsic::umin:
+    return Intrinsic::experimental_vector_histogram_umin;
+  default:
+    llvm_unreachable("Found illegal update instruction for histogram");
+  }
+}
+
 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-SELECT ";
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
index 3b00312959d8a..eeffdad582ce2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
@@ -927,6 +927,205 @@ for.exit:
   ret void
 }
 
+define void @simple_histogram_uadd_sat(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
+; CHECK-LABEL: define void @simple_histogram_uadd_sat(
+; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP6]]
+; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<vscale x 4 x ptr> [[TMP7]], i32 1, <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
+; CHECK-NEXT:    [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[L_IDX]] to i64
+; CHECK-NEXT:    [[GEP_BUCKET:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[L_BUCKET]], i32 1)
+; CHECK-NEXT:    store i32 [[INC]], ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK:       for.exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
+  %l.idx = load i32, ptr %gep.indices, align 4
+  %idxprom1 = zext i32 %l.idx to i64
+  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
+  %l.bucket = load i32, ptr %gep.bucket, align 4
+  %inc = call i32 @llvm.uadd.sat.i32(i32 %l.bucket, i32 1)
+  store i32 %inc, ptr %gep.bucket, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %N
+  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4
+
+for.exit:
+  ret void
+}
+
+define void @simple_histogram_umax(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
+; CHECK-LABEL: define void @simple_histogram_umax(
+; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP6]]
+; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.umax.nxv4p0.i32(<vscale x 4 x ptr> [[TMP7]], i32 120, <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
+; CHECK-NEXT:    [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[L_IDX]] to i64
+; CHECK-NEXT:    [[GEP_BUCKET:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = call i32 @llvm.umax.i32(i32 [[L_BUCKET]], i32 120)
+; CHECK-NEXT:    store i32 [[INC]], ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK:       for.exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
+  %l.idx = load i32, ptr %gep.indices, align 4
+  %idxprom1 = zext i32 %l.idx to i64
+  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
+  %l.bucket = load i32, ptr %gep.bucket, align 4
+  %inc = call i32 @llvm.umax.i32(i32 %l.bucket, i32 120)
+  store i32 %inc, ptr %gep.bucket, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %N
+  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4
+
+for.exit:
+  ret void
+}
+
+define void @simple_histogram_umin(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
+; CHECK-LABEL: define void @simple_histogram_umin(
+; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP6]]
+; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.umin.nxv4p0.i32(<vscale x 4 x ptr> [[TMP7]], i32 120, <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
+; CHECK-NEXT:    [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[L_IDX]] to i64
+; CHECK-NEXT:    [[GEP_BUCKET:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = call i32 @llvm.umin.i32(i32 [[L_BUCKET]], i32 120)
+; CHECK-NEXT:    store i32 [[INC]], ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK:       for.exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
+  %l.idx = load i32, ptr %gep.indices, align 4
+  %idxprom1 = zext i32 %l.idx to i64
+  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
+  %l.bucket = load i32, ptr %gep.bucket, align 4
+  %inc = call i32 @llvm.umin.i32(i32 %l.bucket, i32 120)
+  store i32 %inc, ptr %gep.bucket, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %N
+  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4
+
+for.exit:
+  ret void
+}
+
+
 attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }
 
 !0 = distinct !{!0, !1}

>From 95f68028915448c8c6945ba3061cd4c82d740c7a Mon Sep 17 00:00:00 2001
From: rond <ron.dahan at mobileye.com>
Date: Tue, 18 Feb 2025 11:16:10 +0200
Subject: [PATCH 2/3] Fix review

---
 .../Vectorize/LoopVectorizationLegality.cpp   | 24 +++++++++----------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 12 ++++------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 8325168342330..958b1115fe46d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1072,9 +1072,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
 /// Find histogram operations that match high-level code in loops:
 /// \code
-/// buckets[indices[i]] = UpdateOpeartor(buckets[indices[i]], Val);
+/// buckets[indices[i]] = UpdateOperator(buckets[indices[i]], Val);
 /// \endcode
-/// When updateOperator can be add, sub, add.sat, umin, umax, sub.
+/// When updateOperator can be add, sub, add.sat, umin, umax.
 ///
 /// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
 /// array the computed histogram. It uses a update instruction to update all
@@ -1090,8 +1090,8 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
                           SmallVectorImpl<HistogramInfo> &Histograms) {
 
   Instruction *HPtrInstr = nullptr;
-  Instruction *HInstr = nullptr;
-  if (!match(HSt, m_Store(m_Instruction(HInstr), m_Instruction(HPtrInstr))))
+  Instruction *HUpdateOp = nullptr;
+  if (!match(HSt, m_Store(m_Instruction(HUpdateOp), m_Instruction(HPtrInstr))))
     return false;
 
   // BinOp must be an Add or a Sub modifying the bucket value by a
@@ -1099,13 +1099,13 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
   // FIXME: We assume the loop invariant term is on the RHS.
   //        Fine for an immediate/constant, but maybe not a generic value?
   Value *HIncVal = nullptr;
-  if (!match(HInstr, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
-      !match(HInstr, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
-      !match(HInstr, m_Intrinsic<Intrinsic::uadd_sat>(
+  if (!match(HUpdateOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+      !match(HUpdateOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+      !match(HUpdateOp, m_Intrinsic<Intrinsic::uadd_sat>(
                          m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
-      !match(HInstr, m_Intrinsic<Intrinsic::umax>(m_Load(m_Specific(HPtrInstr)),
+      !match(HUpdateOp, m_Intrinsic<Intrinsic::umax>(m_Load(m_Specific(HPtrInstr)),
                                                   m_Value(HIncVal))) &&
-      !match(HInstr, m_Intrinsic<Intrinsic::umin>(m_Load(m_Specific(HPtrInstr)),
+      !match(HUpdateOp, m_Intrinsic<Intrinsic::umin>(m_Load(m_Specific(HPtrInstr)),
                                                   m_Value(HIncVal))))
     return false;
 
@@ -1148,15 +1148,15 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
 
   // Ensure we'll have the same mask by checking that all parts of the histogram
   // (gather load, update, scatter store) are in the same block.
-  LoadInst *IndexedLoad = cast<LoadInst>(HInstr->getOperand(0));
+  LoadInst *IndexedLoad = cast<LoadInst>(HUpdateOp->getOperand(0));
   BasicBlock *LdBB = IndexedLoad->getParent();
-  if (LdBB != HInstr->getParent() || LdBB != HSt->getParent())
+  if (LdBB != HUpdateOp->getParent() || LdBB != HSt->getParent())
     return false;
 
   LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");
 
   // Store the operations that make up the histogram.
-  Histograms.emplace_back(IndexedLoad, HInstr, HSt);
+  Histograms.emplace_back(IndexedLoad, HUpdateOp, HSt);
   return true;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c25de0e522277..f9c017b6ee033 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1300,29 +1300,27 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
   O << Indent << "WIDEN-HISTOGRAM buckets: ";
   getOperand(0)->printAsOperand(O, SlotTracker);
 
-  std::string UpdateMsg;
   if (isa<BinaryOperator>(UpdateInst)) {
     if (Opcode == Instruction::Sub)
-      UpdateMsg = ", dec: ";
+      O << ", dec: ";
     else {
-      UpdateMsg = ", inc: ";
+      O << ", inc: ";
     }
   } else {
     switch (cast<IntrinsicInst>(UpdateInst)->getIntrinsicID()) {
     case Intrinsic::uadd_sat:
-      UpdateMsg = ", saturated inc: ";
+      O << ", saturated inc: ";
       break;
     case Intrinsic::umax:
-      UpdateMsg = ", max: ";
+      O << ", max: ";
       break;
     case Intrinsic::umin:
-      UpdateMsg = ", min: ";
+      O << ", min: ";
       break;
     default:
       llvm_unreachable("Found Ilegal update instruction for histogram");
     }
   }
-  O << UpdateMsg;
   getOperand(1)->printAsOperand(O, SlotTracker);
 
   if (VPValue *Mask = getMask()) {

>From 145a4ded450e373963baba6743f5e61836c2d564 Mon Sep 17 00:00:00 2001
From: rond <ron.dahan at mobileye.com>
Date: Tue, 18 Feb 2025 12:32:23 +0200
Subject: [PATCH 3/3] Fix lit

---
 .../test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
index 55f82fd55daf4..e6503d0056930 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
@@ -78,6 +78,8 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT:     WIDEN [[IDX:.*]] = load [[VECP_IDX]]
 ; CHECK-NEXT:     WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64
 ; CHECK-NEXT:     WIDEN-GEP Inv[Var] [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]]
+; CHECK-NEXT:     WIDEN [[L_GEP_BUCKET:.*]] = load [[GEP_BUCKET]]
+; CHECK-NEXT:     WIDEN ir<%inc> = add nsw [[L_GEP_BUCKET]], ir<1>
 ; CHECK-NEXT:     WIDEN-HISTOGRAM buckets: [[GEP_BUCKET]], inc: ir<1>
 ; CHECK-NEXT:     EMIT [[IV_NEXT]] = add nuw [[IV]], [[VFxUF]]
 ; CHECK-NEXT:     EMIT branch-on-count [[IV_NEXT]], [[VTC]]



More information about the llvm-commits mailing list