[llvm] Expanding the Histogram Intrinsic (PR #127399)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 18 02:33:12 PST 2025
https://github.com/RonDahan101 updated https://github.com/llvm/llvm-project/pull/127399
From dae234d8cf81966d42b967c6e944ba215da34e0d Mon Sep 17 00:00:00 2001
From: rond <ron.dahan at mobileye.com>
Date: Sun, 16 Feb 2025 15:40:30 +0200
Subject: [PATCH 1/3] Expanding the Histogram Intrinsic
Expand the histogram intrinsic to support more update operations:
uadd.sat, umax, and umin.
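
For reference, bucket-update loops of the following shapes should now be
recognized, in addition to the existing add/sub forms. This is a minimal
C++ sketch: the function names and the bound 120 are illustrative (they
mirror the new sve2-histcnt.ll tests below), and the saturating form
relies on earlier passes folding the compare-select into @llvm.uadd.sat.

  #include <algorithm>
  #include <cstdint>

  // Saturating histogram: each bucket clamps at UINT32_MAX instead of
  // wrapping, which is typically folded to @llvm.uadd.sat.
  void hist_uadd_sat(uint32_t *buckets, const uint32_t *indices,
                     uint64_t n) {
    for (uint64_t i = 0; i < n; ++i) {
      uint32_t v = buckets[indices[i]];
      buckets[indices[i]] = v == UINT32_MAX ? v : v + 1;
    }
  }

  // Clamping histogram: raise each touched bucket to a loop-invariant
  // bound via @llvm.umax; the umin form is symmetric.
  void hist_umax(uint32_t *buckets, const uint32_t *indices, uint64_t n) {
    for (uint64_t i = 0; i < n; ++i)
      buckets[indices[i]] = std::max(buckets[indices[i]], uint32_t(120));
  }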
---
llvm/docs/LangRef.rst | 3 +
llvm/include/llvm/IR/Intrinsics.td | 18 ++
.../Vectorize/LoopVectorizationLegality.cpp | 31 +--
.../Transforms/Vectorize/LoopVectorize.cpp | 6 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 9 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 85 +++++++-
.../LoopVectorize/AArch64/sve2-histcnt.ll | 199 ++++++++++++++++++
7 files changed, 327 insertions(+), 24 deletions(-)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index deb87365ae8d7..59496ebb93cd7 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20295,6 +20295,9 @@ More update operation types may be added in the future.
declare void @llvm.experimental.vector.histogram.add.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask)
declare void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %ptrs, i64 %inc, <vscale x 2 x i1> %mask)
+ declare void @llvm.experimental.vector.histogram.uadd.sat.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask)
+ declare void @llvm.experimental.vector.histogram.umax.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask)
+ declare void @llvm.experimental.vector.histogram.umin.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask)
Arguments:
""""""""""
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 14ecae41ff08f..31a0ba2e6500d 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1947,6 +1947,24 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
[ IntrArgMemOnly ]>;
+def int_experimental_vector_histogram_uadd_sat : DefaultAttrsIntrinsic<[],
+ [ llvm_anyvector_ty, // Vector of pointers
+ llvm_anyint_ty, // Increment
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+ [ IntrArgMemOnly ]>;
+
+def int_experimental_vector_histogram_umin : DefaultAttrsIntrinsic<[],
+ [ llvm_anyvector_ty, // Vector of pointers
+ llvm_anyint_ty, // Update value
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+ [ IntrArgMemOnly ]>;
+
+def int_experimental_vector_histogram_umax : DefaultAttrsIntrinsic<[],
+ [ llvm_anyvector_ty, // Vector of pointers
+ llvm_anyint_ty, // Update value
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+ [ IntrArgMemOnly ]>;
+
// Experimental match
def int_experimental_vector_match : DefaultAttrsIntrinsic<
[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 420cbc5384ce4..8325168342330 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1072,25 +1072,26 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
/// Find histogram operations that match high-level code in loops:
/// \code
-/// buckets[indices[i]]+=step;
+/// buckets[indices[i]] = UpdateOpeartor(buckets[indices[i]], Val);
/// \endcode
+/// When updateOperator can be add, sub, add.sat, umin, umax, sub.
///
/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
-/// array the computed histogram. It uses a BinOp to sum all counts, storing
-/// them using a loop-variant index Load from the 'indices' input array.
+/// array the computed histogram. It uses an update instruction to update all
+/// counts, storing them using a loop-variant index Load from the 'indices'
+/// input array.
///
/// On successful matches it updates the STATISTIC 'HistogramsDetected',
/// regardless of hardware support. When there is support, it additionally
-/// stores the BinOp/Load pairs in \p HistogramCounts, as well the pointers
+/// stores the UpdateOp/Load pairs in \p HistogramCounts, as well as the pointers
/// used to update histogram in \p HistogramPtrs.
static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
const PredicatedScalarEvolution &PSE,
SmallVectorImpl<HistogramInfo> &Histograms) {
- // Store value must come from a Binary Operation.
Instruction *HPtrInstr = nullptr;
- BinaryOperator *HBinOp = nullptr;
- if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr))))
+ Instruction *HInstr = nullptr;
+ if (!match(HSt, m_Store(m_Instruction(HInstr), m_Instruction(HPtrInstr))))
return false;
// BinOp must be an Add or a Sub modifying the bucket value by a
@@ -1098,8 +1099,14 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
// FIXME: We assume the loop invariant term is on the RHS.
// Fine for an immediate/constant, but maybe not a generic value?
Value *HIncVal = nullptr;
- if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
- !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))))
+ if (!match(HInstr, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+ !match(HInstr, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+ !match(HInstr, m_Intrinsic<Intrinsic::uadd_sat>(
+ m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+ !match(HInstr, m_Intrinsic<Intrinsic::umax>(m_Load(m_Specific(HPtrInstr)),
+ m_Value(HIncVal))) &&
+ !match(HInstr, m_Intrinsic<Intrinsic::umin>(m_Load(m_Specific(HPtrInstr)),
+ m_Value(HIncVal))))
return false;
// Make sure the increment value is loop invariant.
@@ -1141,15 +1148,15 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
// Ensure we'll have the same mask by checking that all parts of the histogram
// (gather load, update, scatter store) are in the same block.
- LoadInst *IndexedLoad = cast<LoadInst>(HBinOp->getOperand(0));
+ LoadInst *IndexedLoad = cast<LoadInst>(HInstr->getOperand(0));
BasicBlock *LdBB = IndexedLoad->getParent();
- if (LdBB != HBinOp->getParent() || LdBB != HSt->getParent())
+ if (LdBB != HInstr->getParent() || LdBB != HSt->getParent())
return false;
LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");
// Store the operations that make up the histogram.
- Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
+ Histograms.emplace_back(IndexedLoad, HInstr, HSt);
return true;
}
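As the FIXME above notes, only updates with the loop-invariant operand on the
RHS are matched. A commuted source form, sketched below with illustrative
names, is therefore only caught if earlier passes canonicalize the invariant
operand to the right-hand side:

  #include <algorithm>
  #include <cstdint>

  // Commuted update: the loop-invariant bound is the first operand, i.e.
  // umax(120, bucket) rather than umax(bucket, 120). Matching this relies
  // on instcombine canonicalization before the vectorizer runs.
  void hist_umax_commuted(uint32_t *buckets, const uint32_t *indices,
                          uint64_t n) {
    for (uint64_t i = 0; i < n; ++i)
      buckets[indices[i]] = std::max(uint32_t(120), buckets[indices[i]]);
  }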
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8c41f896ad622..9aec1fe9b570e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8658,14 +8658,16 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
ArrayRef<VPValue *> Operands) {
// FIXME: Support other operations.
unsigned Opcode = HI->Update->getOpcode();
- assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
- "Histogram update operation must be an Add or Sub");
+ assert(VPHistogramRecipe::isLegalUpdateInstruction(HI->Update) &&
+ "Found Ilegal update instruction for histogram");
SmallVector<VPValue *, 3> HGramOps;
// Bucket address.
HGramOps.push_back(Operands[1]);
// Increment value.
HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
+ // Update instruction.
+ HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update));
// In case of predicated execution (due to tail-folding, or conditional
// execution, or both), pass the relevant mask.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8089cfd1ce802..38c0731c4f956 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1466,9 +1466,16 @@ class VPHistogramRecipe : public VPRecipeBase {
/// Return the mask operand if one was provided, or a null pointer if all
/// lanes should be executed unconditionally.
VPValue *getMask() const {
- return getNumOperands() == 3 ? getOperand(2) : nullptr;
+ return getNumOperands() == 4 ? getOperand(3) : nullptr;
}
+ /// Returns true if \p I is a legal update instruction of a histogram operation.
+ static bool isLegalUpdateInstruction(Instruction *I);
+
+ /// Given update instruction \p I, returns the opcode of the corresponding
+ /// histogram instruction.
+ static unsigned getHistogramOpcode(Instruction *I);
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe
void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d57a6c481748c..c25de0e522277 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1223,6 +1223,7 @@ void VPHistogramRecipe::execute(VPTransformState &State) {
Value *Address = State.get(getOperand(0));
Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
+ Instruction *UpdateInst = cast<Instruction>(State.get(getOperand(2)));
VectorType *VTy = cast<VectorType>(Address->getType());
// The histogram intrinsic requires a mask even if the recipe doesn't;
@@ -1239,10 +1240,10 @@ void VPHistogramRecipe::execute(VPTransformState &State) {
// add a separate intrinsic in future, but for now we'll try this.
if (Opcode == Instruction::Sub)
IncAmt = Builder.CreateNeg(IncAmt);
- else
- assert(Opcode == Instruction::Add && "only add or sub supported for now");
+ assert(isLegalUpdateInstruction(UpdateInst) &&
+ "Found Ilegal update instruction for histogram");
- State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
+ State.Builder.CreateIntrinsic(getHistogramOpcode(UpdateInst),
{VTy, IncAmt->getType()},
{Address, IncAmt, Mask});
}
@@ -1277,24 +1278,51 @@ InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
Type::getVoidTy(Ctx.LLVMCtx),
{PtrTy, IncTy, MaskTy});
+  auto *UpdateInst = getOperand(2)->getUnderlyingValue();
+  InstructionCost UpdateCost;
+  if (auto *II = dyn_cast<IntrinsicInst>(UpdateInst)) {
+    // Cost the scalar update via its intrinsic ID; the recipe's Opcode is
+    // Instruction::Call for intrinsic updates, which is not an intrinsic ID.
+    IntrinsicCostAttributes UpdateICA(II->getIntrinsicID(), IncTy,
+                                      {IncTy, IncTy});
+    UpdateCost = Ctx.TTI.getIntrinsicInstrCost(UpdateICA, Ctx.CostKind);
+  } else {
+    UpdateCost = Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
+  }
// Add the costs together with the add/sub operation.
return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
- Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
+ UpdateCost;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
+ auto *UpdateInst = cast<Instruction>(getOperand(2)->getUnderlyingValue());
+ assert(isLegalUpdateInstruction(UpdateInst) &&
+ "Found Ilegal update instruction for histogram");
O << Indent << "WIDEN-HISTOGRAM buckets: ";
getOperand(0)->printAsOperand(O, SlotTracker);
- if (Opcode == Instruction::Sub)
- O << ", dec: ";
- else {
- assert(Opcode == Instruction::Add);
- O << ", inc: ";
+ std::string UpdateMsg;
+ if (isa<BinaryOperator>(UpdateInst)) {
+ if (Opcode == Instruction::Sub)
+ UpdateMsg = ", dec: ";
+ else {
+ UpdateMsg = ", inc: ";
+ }
+ } else {
+ switch (cast<IntrinsicInst>(UpdateInst)->getIntrinsicID()) {
+ case Intrinsic::uadd_sat:
+ UpdateMsg = ", saturated inc: ";
+ break;
+ case Intrinsic::umax:
+ UpdateMsg = ", max: ";
+ break;
+ case Intrinsic::umin:
+ UpdateMsg = ", min: ";
+ break;
+ default:
+ llvm_unreachable("Found Ilegal update instruction for histogram");
+ }
}
+ O << UpdateMsg;
getOperand(1)->printAsOperand(O, SlotTracker);
if (VPValue *Mask = getMask()) {
@@ -1303,6 +1331,45 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
}
}
+bool VPHistogramRecipe::isLegalUpdateInstruction(Instruction *I) {
+ // We only support add and sub instructions and the following list of
+ // intrinsics: uadd.sat, umax, umin.
+ if (isa<BinaryOperator>(I))
+ return I->getOpcode() == Instruction::Add ||
+ I->getOpcode() == Instruction::Sub;
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::uadd_sat:
+ case Intrinsic::umax:
+ case Intrinsic::umin:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+unsigned VPHistogramRecipe::getHistogramOpcode(Instruction *I) {
+ // We only support add and sub instructions and the following list of
+ // intrinsics: uadd.sat, umax, umin.
+ assert(isLegalUpdateInstruction(I) &&
+ "Found Ilegal update instruction for histogram");
+ if (isa<BinaryOperator>(I))
+ return Intrinsic::experimental_vector_histogram_add;
+ auto *II = cast<IntrinsicInst>(I);
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::uadd_sat:
+ return Intrinsic::experimental_vector_histogram_uadd_sat;
+ case Intrinsic::umax:
+ return Intrinsic::experimental_vector_histogram_umax;
+ case Intrinsic::umin:
+ return Intrinsic::experimental_vector_histogram_umin;
+ default:
+ llvm_unreachable("Found Ilegal update instruction for histogram");
+ }
+}
+
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-SELECT ";
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
index 3b00312959d8a..eeffdad582ce2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
@@ -927,6 +927,205 @@ for.exit:
ret void
}
+define void @simple_histogram_uadd_sat(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
+; CHECK-LABEL: define void @simple_histogram_uadd_sat(
+; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP6]]
+; CHECK-NEXT: call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<vscale x 4 x ptr> [[TMP7]], i32 1, <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
+; CHECK-NEXT: [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4
+; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[L_IDX]] to i64
+; CHECK-NEXT: [[GEP_BUCKET:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
+; CHECK-NEXT: [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT: [[INC:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[L_BUCKET]], i32 1)
+; CHECK-NEXT: store i32 [[INC]], ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK: for.exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
+ %l.idx = load i32, ptr %gep.indices, align 4
+ %idxprom1 = zext i32 %l.idx to i64
+ %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
+ %l.bucket = load i32, ptr %gep.bucket, align 4
+ %inc = call i32 @llvm.uadd.sat.i32(i32 %l.bucket, i32 1)
+ store i32 %inc, ptr %gep.bucket, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %N
+ br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4
+
+for.exit:
+ ret void
+}
+
+define void @simple_histogram_umax(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
+; CHECK-LABEL: define void @simple_histogram_umax(
+; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP6]]
+; CHECK-NEXT: call void @llvm.experimental.vector.histogram.umax.nxv4p0.i32(<vscale x 4 x ptr> [[TMP7]], i32 120, <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
+; CHECK-NEXT: [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4
+; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[L_IDX]] to i64
+; CHECK-NEXT: [[GEP_BUCKET:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
+; CHECK-NEXT: [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT: [[INC:%.*]] = call i32 @llvm.umax.i32(i32 [[L_BUCKET]], i32 120)
+; CHECK-NEXT: store i32 [[INC]], ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK: for.exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
+ %l.idx = load i32, ptr %gep.indices, align 4
+ %idxprom1 = zext i32 %l.idx to i64
+ %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
+ %l.bucket = load i32, ptr %gep.bucket, align 4
+ %inc = call i32 @llvm.umax.i32(i32 %l.bucket, i32 120)
+ store i32 %inc, ptr %gep.bucket, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %N
+ br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4
+
+for.exit:
+ ret void
+}
+
+define void @simple_histogram_umin(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
+; CHECK-LABEL: define void @simple_histogram_umin(
+; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP6]]
+; CHECK-NEXT: call void @llvm.experimental.vector.histogram.umin.nxv4p0.i32(<vscale x 4 x ptr> [[TMP7]], i32 120, <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
+; CHECK-NEXT: [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4
+; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[L_IDX]] to i64
+; CHECK-NEXT: [[GEP_BUCKET:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
+; CHECK-NEXT: [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT: [[INC:%.*]] = call i32 @llvm.umin.i32(i32 [[L_BUCKET]], i32 120)
+; CHECK-NEXT: store i32 [[INC]], ptr [[GEP_BUCKET]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK: for.exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
+ %l.idx = load i32, ptr %gep.indices, align 4
+ %idxprom1 = zext i32 %l.idx to i64
+ %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
+ %l.bucket = load i32, ptr %gep.bucket, align 4
+ %inc = call i32 @llvm.umin.i32(i32 %l.bucket, i32 120)
+ store i32 %inc, ptr %gep.bucket, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %N
+ br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4
+
+for.exit:
+ ret void
+}
+
+
attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }
!0 = distinct !{!0, !1}
From 95f68028915448c8c6945ba3061cd4c82d740c7a Mon Sep 17 00:00:00 2001
From: rond <ron.dahan at mobileye.com>
Date: Tue, 18 Feb 2025 11:16:10 +0200
Subject: [PATCH 2/3] Fix review
---
.../Vectorize/LoopVectorizationLegality.cpp | 24 +++++++++----------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 12 ++++------
2 files changed, 17 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 8325168342330..958b1115fe46d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1072,9 +1072,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
/// Find histogram operations that match high-level code in loops:
/// \code
-/// buckets[indices[i]] = UpdateOpeartor(buckets[indices[i]], Val);
+/// buckets[indices[i]] = UpdateOperator(buckets[indices[i]], Val);
/// \endcode
-/// When updateOperator can be add, sub, add.sat, umin, umax, sub.
+/// Where updateOperator can be add, sub, uadd.sat, umin, or umax.
///
/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
/// array the computed histogram. It uses an update instruction to update all
@@ -1090,8 +1090,8 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
SmallVectorImpl<HistogramInfo> &Histograms) {
Instruction *HPtrInstr = nullptr;
- Instruction *HInstr = nullptr;
- if (!match(HSt, m_Store(m_Instruction(HInstr), m_Instruction(HPtrInstr))))
+ Instruction *HUpdateOp = nullptr;
+ if (!match(HSt, m_Store(m_Instruction(HUpdateOp), m_Instruction(HPtrInstr))))
return false;
// BinOp must be an Add or a Sub modifying the bucket value by a
@@ -1099,13 +1099,13 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
// FIXME: We assume the loop invariant term is on the RHS.
// Fine for an immediate/constant, but maybe not a generic value?
Value *HIncVal = nullptr;
- if (!match(HInstr, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
- !match(HInstr, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
- !match(HInstr, m_Intrinsic<Intrinsic::uadd_sat>(
+ if (!match(HUpdateOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+ !match(HUpdateOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
+ !match(HUpdateOp, m_Intrinsic<Intrinsic::uadd_sat>(
m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
- !match(HInstr, m_Intrinsic<Intrinsic::umax>(m_Load(m_Specific(HPtrInstr)),
+ !match(HUpdateOp, m_Intrinsic<Intrinsic::umax>(m_Load(m_Specific(HPtrInstr)),
m_Value(HIncVal))) &&
- !match(HInstr, m_Intrinsic<Intrinsic::umin>(m_Load(m_Specific(HPtrInstr)),
+ !match(HUpdateOp, m_Intrinsic<Intrinsic::umin>(m_Load(m_Specific(HPtrInstr)),
m_Value(HIncVal))))
return false;
@@ -1148,15 +1148,15 @@ static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
// Ensure we'll have the same mask by checking that all parts of the histogram
// (gather load, update, scatter store) are in the same block.
- LoadInst *IndexedLoad = cast<LoadInst>(HInstr->getOperand(0));
+ LoadInst *IndexedLoad = cast<LoadInst>(HUpdateOp->getOperand(0));
BasicBlock *LdBB = IndexedLoad->getParent();
- if (LdBB != HInstr->getParent() || LdBB != HSt->getParent())
+ if (LdBB != HUpdateOp->getParent() || LdBB != HSt->getParent())
return false;
LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");
// Store the operations that make up the histogram.
- Histograms.emplace_back(IndexedLoad, HInstr, HSt);
+ Histograms.emplace_back(IndexedLoad, HUpdateOp, HSt);
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c25de0e522277..f9c017b6ee033 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1300,29 +1300,27 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "WIDEN-HISTOGRAM buckets: ";
getOperand(0)->printAsOperand(O, SlotTracker);
- std::string UpdateMsg;
if (isa<BinaryOperator>(UpdateInst)) {
if (Opcode == Instruction::Sub)
- UpdateMsg = ", dec: ";
+ O << ", dec: ";
else {
- UpdateMsg = ", inc: ";
+ O << ", inc: ";
}
} else {
switch (cast<IntrinsicInst>(UpdateInst)->getIntrinsicID()) {
case Intrinsic::uadd_sat:
- UpdateMsg = ", saturated inc: ";
+ O << ", saturated inc: ";
break;
case Intrinsic::umax:
- UpdateMsg = ", max: ";
+ O << ", max: ";
break;
case Intrinsic::umin:
- UpdateMsg = ", min: ";
+ O << ", min: ";
break;
default:
llvm_unreachable("Found Ilegal update instruction for histogram");
}
}
- O << UpdateMsg;
getOperand(1)->printAsOperand(O, SlotTracker);
if (VPValue *Mask = getMask()) {
From 145a4ded450e373963baba6743f5e61836c2d564 Mon Sep 17 00:00:00 2001
From: rond <ron.dahan at mobileye.com>
Date: Tue, 18 Feb 2025 12:32:23 +0200
Subject: [PATCH 3/3] Fix lit
---
.../test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
index 55f82fd55daf4..e6503d0056930 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
@@ -78,6 +78,8 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK-NEXT: WIDEN [[IDX:.*]] = load [[VECP_IDX]]
; CHECK-NEXT: WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64
; CHECK-NEXT: WIDEN-GEP Inv[Var] [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]]
+; CHECK-NEXT: WIDEN [[L_GEP_BUCKET:.*]] = load [[GEP_BUCKET]]
+; CHECK-NEXT: WIDEN ir<%inc> = add nsw [[L_GEP_BUCKET]], ir<1>
; CHECK-NEXT: WIDEN-HISTOGRAM buckets: [[GEP_BUCKET]], inc: ir<1>
; CHECK-NEXT: EMIT [[IV_NEXT]] = add nuw [[IV]], [[VFxUF]]
; CHECK-NEXT: EMIT branch-on-count [[IV_NEXT]], [[VTC]]