[llvm] Add an all-in-one histogram intrinsic, along with lowering for AArch64 (PR #88106)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Tue May 7 01:42:33 PDT 2024
https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/88106
>From 07812dbb9624c039076fee2c4cac553283dbbef3 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 20 Mar 2024 12:47:33 +0000
Subject: [PATCH] Add an all-in-one histogram intrinsic, along with lowering
for AArch64
Current interface is:
llvm.experimental.vector.histogram.op(<vecty> ptrs, <intty> inc_amount, <vecty> mask)
Where op is the update operation (currently limited to 'add').
The integer type used by 'inc_amount' needs to match the type of the buckets
in memory.
The intrinsic covers the following operations:
* Gather load
* Histogram on the elements of 'ptrs'
* Multiply the histogram results by 'inc_amount'
* Add the result of the multiply to the values loaded by the gather
* Scatter store the results of the add
These operations can obviously be scalarized on platforms without the
relevant instructions.
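As a rough illustration, a single fixed-width call over i32 buckets might look
like the following (a hand-written sketch; %base, %indices, %inc and %mask are
placeholder values rather than anything taken from the patch):

    %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
    call void @llvm.experimental.vector.histogram.add.v4p0.i32(<4 x ptr> %buckets,
                                                               i32 %inc,
                                                               <4 x i1> %mask)

Each lane whose mask bit is set adds 'inc' to its bucket, and lanes that point
at the same bucket accumulate rather than overwrite each other.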
---
llvm/docs/LangRef.rst | 54 +++++++++
.../llvm/Analysis/TargetTransformInfo.h | 7 ++
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 +
llvm/include/llvm/CodeGen/ISDOpcodes.h | 5 +
llvm/include/llvm/CodeGen/SelectionDAG.h | 3 +
llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 33 +++++
llvm/include/llvm/IR/Intrinsics.td | 7 ++
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 +
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 38 ++++++
.../SelectionDAG/SelectionDAGBuilder.cpp | 63 ++++++++++
.../SelectionDAG/SelectionDAGBuilder.h | 1 +
.../SelectionDAG/SelectionDAGDumper.cpp | 3 +
.../Target/AArch64/AArch64ISelLowering.cpp | 63 ++++++++++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 +
.../Scalar/ScalarizeMaskedMemIntrin.cpp | 69 +++++++++++
.../AArch64/neon-scalarize-histogram.ll | 114 ++++++++++++++++++
llvm/test/CodeGen/AArch64/sve2-histcnt.ll | 53 ++++++++
17 files changed, 523 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2-histcnt.ll
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ff0fc55860de9..d63be4d1777ee 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19068,6 +19068,60 @@ will be on any later loop iteration.
This intrinsic will only return 0 if the input count is also 0. A non-zero input
count will produce a non-zero result.
+'``llvm.experimental.vector.histogram.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These intrinsics are overloaded.
+
+These intrinsics represent histogram-like operations; that is, updating values
+in memory that may not be contiguous, and where multiple elements within a
+single vector may be updating the same value in memory.
+
+The update operation must be specified as part of the intrinsic name. For a
+simple histogram like the following, the ``add`` operation would be used.
+
+.. code-block:: c
+
+ void simple_histogram(int *restrict buckets, unsigned *indices, int N, int inc) {
+ for (int i = 0; i < N; ++i)
+ buckets[indices[i]] += inc;
+ }
+
+More update operation types may be added in the future.
+
+::
+
+ declare <8 x i32> @llvm.experimental.vector.histogram.add.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask)
+ declare <vscale x 2 x i64> @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %ptrs, i64 %inc, <vscale x 2 x i1> %mask)
+
+Arguments:
+""""""""""
+
+The first argument is a vector of pointers to the memory locations to be
+updated. The second argument is a scalar used to update the values in
+memory; its type must match that of the values being updated. The final argument
+is a mask value to exclude locations from being modified.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.vector.histogram.*``' intrinsics are used to perform
+updates on potentially overlapping values in memory. The intrinsics represent
+the following sequence of operations:
+
+1. Gather load from the ``ptrs`` operand, with element type matching that of
+ the ``inc`` operand.
+2. Update of the values loaded from memory. In the case of the ``add``
+ update operation, this means:
+
+ 1. Perform a cross-vector histogram operation on the ``ptrs`` operand.
+ 2. Multiply the result by the ``inc`` operand.
+ 3. Add the result to the values loaded from memory
+3. Scatter the result of the update operation to the memory locations from
+ the ``ptrs`` operand.
+
+The ``mask`` operand will apply to at least the gather and scatter operations.
+
Matrix Intrinsics
-----------------
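To make the ``add`` semantics above concrete, here is a small sketch with
hypothetical values: if the four lanes of ``ptrs`` are [&a, &b, &a, &c], ``inc``
is 2 and the mask is all true, the two lanes targeting 'a' accumulate, so the
net effect is a += 4, b += 2 and c += 2:

    ; Hypothetical values: lanes 0 and 2 of %ptrs both point at 'a'.
    call void @llvm.experimental.vector.histogram.add.v4p0.i32(
        <4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
    ; Net effect: a += 4, b += 2, c += 2.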
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1c76821fe5e4a..b3da4ba50fb2e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -797,6 +797,9 @@ class TargetTransformInfo {
/// Return true if the target supports strided load.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const;
+  /// Return true if the target supports masked vector histograms.
+ bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const;
+
/// Return true if this is an alternating opcode pattern that can be lowered
/// to a single instruction on the target. In X86 this is for the addsub
/// instruction which corrsponds to a Shuffle + Fadd + FSub pattern in IR.
@@ -1883,6 +1886,7 @@ class TargetTransformInfo::Concept {
virtual bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) = 0;
virtual bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) = 0;
virtual bool isLegalStridedLoadStore(Type *DataType, Align Alignment) = 0;
+ virtual bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) = 0;
virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
unsigned Opcode1,
const SmallBitVector &OpcodeMask) const = 0;
@@ -2386,6 +2390,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) override {
return Impl.isLegalStridedLoadStore(DataType, Alignment);
}
+ bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) override {
+ return Impl.isLegalMaskedVectorHistogram(AddrType, DataType);
+ }
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
const SmallBitVector &OpcodeMask) const override {
return Impl.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4d5cd963e0926..32d8b0807b2ab 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -315,6 +315,10 @@ class TargetTransformInfoImplBase {
return false;
}
+ bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const {
+ return false;
+ }
+
bool enableOrderedReductions() const { return false; }
bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; }
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 6429947958ee9..d8af97957e48e 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1402,6 +1402,11 @@ enum NodeType {
// which is later translated to an implicit use in the MIR.
CONVERGENCECTRL_GLUE,
+ // Experimental vector histogram intrinsic
+ // Operands: Input Chain, Inc, Mask, Base, Index, Scale, ID
+ // Output: Output Chain
+ EXPERIMENTAL_VECTOR_HISTOGRAM,
+
/// BUILTIN_OP_END - This must be the last enum value in this list.
/// The target-specific pre-isel opcode values start here.
BUILTIN_OP_END
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 4b1b58d4af0bb..71bf76c93e478 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1526,6 +1526,9 @@ class SelectionDAG {
ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
ISD::MemIndexType IndexType,
bool IsTruncating = false);
+ SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType);
SDValue getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT,
MachineMemOperand *MMO);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index e7c7104145455..ac94c6099d080 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -542,6 +542,7 @@ BEGIN_TWO_BYTE_PACK()
friend class MaskedLoadStoreSDNode;
friend class MaskedGatherScatterSDNode;
friend class VPGatherScatterSDNode;
+ friend class MaskedHistogramSDNode;
uint16_t : NumMemSDNodeBits;
@@ -552,6 +553,7 @@ BEGIN_TWO_BYTE_PACK()
// MaskedLoadStoreBaseSDNode => enum ISD::MemIndexedMode
// VPGatherScatterSDNode => enum ISD::MemIndexType
// MaskedGatherScatterSDNode => enum ISD::MemIndexType
+ // MaskedHistogramSDNode => enum ISD::MemIndexType
uint16_t AddressingMode : 3;
};
enum { NumLSBaseSDNodeBits = NumMemSDNodeBits + 3 };
@@ -564,6 +566,7 @@ BEGIN_TWO_BYTE_PACK()
friend class MaskedLoadSDNode;
friend class MaskedGatherSDNode;
friend class VPGatherSDNode;
+ friend class MaskedHistogramSDNode;
uint16_t : NumLSBaseSDNodeBits;
@@ -1420,6 +1423,7 @@ class MemSDNode : public SDNode {
return getOperand(2);
case ISD::MGATHER:
case ISD::MSCATTER:
+ case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
return getOperand(3);
default:
return getOperand(1);
@@ -1468,6 +1472,7 @@ class MemSDNode : public SDNode {
case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
case ISD::GET_FPENV_MEM:
case ISD::SET_FPENV_MEM:
+ case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
return true;
default:
return N->isMemIntrinsic() || N->isTargetMemoryOpcode();
@@ -2953,6 +2958,34 @@ class MaskedScatterSDNode : public MaskedGatherScatterSDNode {
}
};
+class MaskedHistogramSDNode : public MemSDNode {
+public:
+ friend class SelectionDAG;
+
+ MaskedHistogramSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
+ EVT MemVT, MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType)
+ : MemSDNode(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, Order, DL, VTs, MemVT,
+ MMO) {
+ LSBaseSDNodeBits.AddressingMode = IndexType;
+ }
+
+ ISD::MemIndexType getIndexType() const {
+ return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
+ }
+
+ const SDValue &getBasePtr() const { return getOperand(3); }
+ const SDValue &getIndex() const { return getOperand(4); }
+ const SDValue &getMask() const { return getOperand(2); }
+ const SDValue &getScale() const { return getOperand(5); }
+ const SDValue &getInc() const { return getOperand(1); }
+ const SDValue &getIntID() const { return getOperand(6); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == ISD::EXPERIMENTAL_VECTOR_HISTOGRAM;
+ }
+};
+
class FPStateAccessSDNode : public MemSDNode {
public:
friend class SelectionDAG;
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 28116e5316c96..926baac070284 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1855,6 +1855,13 @@ def int_experimental_vp_strided_load : DefaultAttrsIntrinsic<[llvm_anyvector_ty
llvm_i32_ty],
[ NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>;
+// Experimental histogram
+def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
+ [ llvm_anyvector_ty, // Vector of pointers
+ llvm_anyint_ty, // Increment
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+ [ IntrArgMemOnly ]>;
+
// Operators
let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
// Integer arithmetic
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 33c899fe88999..c3b7ceba55ed4 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -513,6 +513,11 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType,
return TTIImpl->isLegalStridedLoadStore(DataType, Alignment);
}
+bool TargetTransformInfo::isLegalMaskedVectorHistogram(Type *AddrType,
+ Type *DataType) const {
+ return TTIImpl->isLegalMaskedVectorHistogram(AddrType, DataType);
+}
+
bool TargetTransformInfo::enableOrderedReductions() const {
return TTIImpl->enableOrderedReductions();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index eef5acd032345..296fbed0fefa3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9614,6 +9614,44 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl,
return V;
}
+SDValue SelectionDAG::getMaskedHistogram(SDVTList VTs, EVT MemVT,
+ const SDLoc &dl, ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType) {
+ assert(Ops.size() == 7 && "Incompatible number of operands");
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<MaskedHistogramSDNode>(
+ dl.getIROrder(), VTs, MemVT, MMO, IndexType));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+    cast<MaskedHistogramSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ auto *N = newSDNode<MaskedHistogramSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+ VTs, MemVT, MMO, IndexType);
+ createOperands(N, Ops);
+
+ assert(N->getMask().getValueType().getVectorElementCount() ==
+ N->getIndex().getValueType().getVectorElementCount() &&
+ "Vector width mismatch between mask and data");
+ assert(isa<ConstantSDNode>(N->getScale()) &&
+ N->getScale()->getAsAPIntVal().isPowerOf2() &&
+ "Scale should be a constant power of 2");
+ assert(N->getInc().getValueType().isInteger() && "Non integer update value");
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
SDValue SelectionDAG::getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr,
EVT MemVT, MachineMemOperand *MMO) {
assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index cfd82a342433f..e320e72572d93 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6281,6 +6281,64 @@ void SelectionDAGBuilder::visitConvergenceControl(const CallInst &I,
}
}
+void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
+ unsigned IntrinsicID) {
+ // For now, we're only lowering an 'add' histogram.
+ // We can add others later, e.g. saturating adds, min/max.
+ assert(IntrinsicID == Intrinsic::experimental_vector_histogram_add &&
+ "Tried to lower unsupported histogram type");
+ SDLoc sdl = getCurSDLoc();
+ Value *Ptr = I.getOperand(0);
+ SDValue Inc = getValue(I.getOperand(1));
+ SDValue Mask = getValue(I.getOperand(2));
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ DataLayout TargetDL = DAG.getDataLayout();
+ EVT VT = Inc.getValueType();
+ Align Alignment = DAG.getEVTAlign(VT);
+
+ const MDNode *Ranges = getRangeMetadata(I);
+
+ SDValue Root = DAG.getRoot();
+ SDValue Base;
+ SDValue Index;
+ ISD::MemIndexType IndexType;
+ SDValue Scale;
+ bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+ I.getParent(), VT.getScalarStoreSize());
+
+ unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(AS),
+ MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, Alignment, I.getAAMetadata(), Ranges);
+
+ if (!UniformBase) {
+ Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ Index = getValue(Ptr);
+ IndexType = ISD::SIGNED_SCALED;
+ Scale =
+ DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ }
+
+ EVT IdxVT = Index.getValueType();
+ EVT EltTy = IdxVT.getVectorElementType();
+ if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
+ EVT NewIdxVT = IdxVT.changeVectorElementType(EltTy);
+ Index = DAG.getNode(ISD::SIGN_EXTEND, sdl, NewIdxVT, Index);
+ }
+
+ SDValue ID = DAG.getTargetConstant(IntrinsicID, sdl, MVT::i32);
+
+ SDValue Ops[] = {Root, Inc, Mask, Base, Index, Scale, ID};
+ SDValue Histogram = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), VT, sdl,
+ Ops, MMO, IndexType);
+
+ setValue(&I, Histogram);
+ DAG.setRoot(Histogram);
+}
+
/// Lower the call to the specified intrinsic function.
void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
unsigned Intrinsic) {
@@ -7949,6 +8007,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::experimental_convergence_entry:
case Intrinsic::experimental_convergence_loop:
visitConvergenceControl(I, Intrinsic);
+ return;
+ case Intrinsic::experimental_vector_histogram_add: {
+ visitVectorHistogram(I, Intrinsic);
+ return;
+ }
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 211e1653de560..ae361f8c500a0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -624,6 +624,7 @@ class SelectionDAGBuilder {
void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
void visitConvergenceControl(const CallInst &I, unsigned Intrinsic);
+ void visitVectorHistogram(const CallInst &I, unsigned IntrinsicID);
void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT,
const SmallVectorImpl<SDValue> &OpValues);
void visitVPStore(const VPIntrinsic &VPIntrin,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 4ad4a938ca97f..59742e90c6791 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -529,6 +529,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::PATCHPOINT:
return "patchpoint";
+ case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
+ return "histogram";
+
// Vector Predication
#define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \
case ISD::SDID: \
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c1ca78af5cda8..1878b8200ccbb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1606,6 +1606,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
}
+ // Histcnt is SVE2 only
+ if (Subtarget->hasSVE2() && Subtarget->isSVEAvailable())
+ setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other,
+ Custom);
+
// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -6730,6 +6735,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFunnelShift(Op, DAG);
case ISD::FLDEXP:
return LowerFLDEXP(Op, DAG);
+ case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
+ return LowerVECTOR_HISTOGRAM(Op, DAG);
}
}
@@ -27249,6 +27256,62 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
return DAG.getMergeValues({Lo, Hi}, DL);
}
+SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
+ SelectionDAG &DAG) const {
+ // FIXME: Maybe share some code with LowerMGather/Scatter?
+ MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
+ SDLoc DL(HG);
+ SDValue Chain = HG->getChain();
+ SDValue Inc = HG->getInc();
+ SDValue Mask = HG->getMask();
+ SDValue Ptr = HG->getBasePtr();
+ SDValue Index = HG->getIndex();
+ SDValue Scale = HG->getScale();
+ SDValue IntID = HG->getIntID();
+
+ // The Intrinsic ID determines the type of update operation.
+ ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
+ // Right now, we only support 'add' as an update.
+ assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
+ "Unexpected histogram update operation");
+
+ EVT IncVT = Inc.getValueType();
+ EVT IndexVT = Index.getValueType();
+ EVT MemVT = EVT::getVectorVT(*DAG.getContext(), IncVT,
+ IndexVT.getVectorElementCount());
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ SDValue PassThru = DAG.getSplatVector(MemVT, DL, Zero);
+ SDValue IncSplat = DAG.getSplatVector(MemVT, DL, Inc);
+ SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
+
+ // Set the MMO to load only, rather than load|store.
+ MachineMemOperand *GMMO = HG->getMemOperand();
+ GMMO->setFlags(MachineMemOperand::MOLoad);
+ ISD::MemIndexType IndexType = HG->getIndexType();
+ SDValue Gather =
+ DAG.getMaskedGather(DAG.getVTList(MemVT, MVT::Other), MemVT, DL, Ops,
+ GMMO, IndexType, ISD::NON_EXTLOAD);
+
+ SDValue GChain = Gather.getValue(1);
+
+ // Perform the histcnt, multiply by inc, add to bucket data.
+ SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncVT);
+ SDValue HistCnt =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, MemVT, HistCnt, IncSplat);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, MemVT, Gather, Mul);
+
+ // Create a new MMO for the scatter.
+ MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
+ GMMO->getPointerInfo(), MachineMemOperand::MOStore, GMMO->getSize(),
+ GMMO->getAlign(), GMMO->getAAInfo());
+
+ SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
+ SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
+ ScatterOps, SMMO, IndexType, false);
+ return Scatter;
+}
+
SDValue
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index fbdc4de5617fe..a02334079c92d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1149,6 +1149,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index a4111fad5d9f2..de80fa2c05023 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -862,6 +862,69 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI,
ModifiedDT = true;
}
+static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI,
+ DomTreeUpdater *DTU,
+ bool &ModifiedDT) {
+ // If we extend histogram to return a result someday (like the updated vector)
+ // then we'll need to support it here.
+ assert(CI->getType()->isVoidTy() && "Histogram with non-void return.");
+ Value *Ptrs = CI->getArgOperand(0);
+ Value *Inc = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+
+ auto *AddrType = cast<FixedVectorType>(Ptrs->getType());
+ Type *EltTy = Inc->getType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ Builder.SetInsertPoint(InsertPt);
+
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // FIXME: Do we need to add an alignment parameter to the intrinsic?
+ unsigned VectorWidth = AddrType->getNumElements();
+
+ // Shorten the way if the mask is a vector of constants.
+ if (isConstantIntVector(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
+ continue;
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
+ LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx));
+ Value *Add = Builder.CreateAdd(Load, Inc);
+ Builder.CreateStore(Add, Ptr);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ Value *Predicate =
+ Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+
+ Instruction *ThenTerm =
+ SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
+ /*BranchWeights=*/nullptr, DTU);
+
+ BasicBlock *CondBlock = ThenTerm->getParent();
+ CondBlock->setName("cond.histogram.update");
+
+ Builder.SetInsertPoint(CondBlock->getTerminator());
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
+ LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx));
+ Value *Add = Builder.CreateAdd(Load, Inc);
+ Builder.CreateStore(Add, Ptr);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = ThenTerm->getSuccessor(0);
+ NewIfBlock->setName("else");
+ Builder.SetInsertPoint(NewIfBlock, NewIfBlock->begin());
+ }
+
+ CI->eraseFromParent();
+ ModifiedDT = true;
+}
+
static bool runImpl(Function &F, const TargetTransformInfo &TTI,
DominatorTree *DT) {
std::optional<DomTreeUpdater> DTU;
@@ -938,6 +1001,12 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
switch (II->getIntrinsicID()) {
default:
break;
+ case Intrinsic::experimental_vector_histogram_add:
+ if (TTI.isLegalMaskedVectorHistogram(CI->getArgOperand(0)->getType(),
+ CI->getArgOperand(1)->getType()))
+ return false;
+ scalarizeMaskedVectorHistogram(DL, CI, DTU, ModifiedDT);
+ break;
case Intrinsic::masked_load:
// Scalarize unsupported vector masked load
if (TTI.isLegalMaskedLoad(
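When the mask is not a constant vector, the scalarized fallback emitted by the
code above produces one guarded update per lane, roughly of this shape (a
hand-written sketch for a 2-lane i64 histogram; the value and block names
mirror the ones the pass creates):

    %Mask0 = extractelement <2 x i1> %mask, i64 0
    br i1 %Mask0, label %cond.histogram.update, label %else

  cond.histogram.update:
    %Ptr0 = extractelement <2 x ptr> %buckets, i64 0
    %Load0 = load i64, ptr %Ptr0
    %Add0 = add i64 %Load0, %inc
    store i64 %Add0, ptr %Ptr0
    br label %else

  else:
    ; ...the same extract/branch/load/add/store sequence repeats for lane 1...

The neon-scalarize-histogram.ll test below checks the AArch64 code generated
from this expansion.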
diff --git a/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
new file mode 100644
index 0000000000000..45f1429a810a0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
+
+;; This test exercises the default lowering of the histogram to scalarized code.
+
+define void @histogram_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) {
+; CHECK-LABEL: histogram_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: tbnz w8, #0, .LBB0_3
+; CHECK-NEXT: // %bb.1: // %else
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: tbnz w8, #0, .LBB0_4
+; CHECK-NEXT: .LBB0_2: // %else2
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_3: // %cond.histogram.update
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: ldr x9, [x8]
+; CHECK-NEXT: add x9, x9, x0
+; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: tbz w8, #0, .LBB0_2
+; CHECK-NEXT: .LBB0_4: // %cond.histogram.update1
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: ldr x9, [x8]
+; CHECK-NEXT: add x9, x9, x0
+; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask)
+ ret void
+}
+
+define void @histogram_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) {
+; CHECK-LABEL: histogram_i32_literal:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v2.2d, x0
+; CHECK-NEXT: sshll v3.2d, v0.2s, #2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: add v3.2d, v2.2d, v3.2d
+; CHECK-NEXT: tbz w8, #0, .LBB1_2
+; CHECK-NEXT: // %bb.1: // %cond.histogram.update
+; CHECK-NEXT: fmov x8, d3
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: add w9, w9, #1
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: .LBB1_2: // %else
+; CHECK-NEXT: umov w8, v1.h[1]
+; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT: tbz w8, #0, .LBB1_4
+; CHECK-NEXT: // %bb.3: // %cond.histogram.update1
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: add w9, w9, #1
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: .LBB1_4: // %else2
+; CHECK-NEXT: umov w8, v1.h[2]
+; CHECK-NEXT: add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT: tbnz w8, #0, .LBB1_7
+; CHECK-NEXT: // %bb.5: // %else4
+; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: tbnz w8, #0, .LBB1_8
+; CHECK-NEXT: .LBB1_6: // %else6
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_7: // %cond.histogram.update3
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: add w9, w9, #1
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: tbz w8, #0, .LBB1_6
+; CHECK-NEXT: .LBB1_8: // %cond.histogram.update5
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: add w9, w9, #1
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: ret
+
+ %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) {
+; CHECK-LABEL: histogram_i32_literal_alltruemask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v1.2d, x0
+; CHECK-NEXT: sshll v2.2d, v0.2s, #2
+; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT: add v2.2d, v1.2d, v2.2d
+; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: mov x9, v2.d[1]
+; CHECK-NEXT: ldr w10, [x8]
+; CHECK-NEXT: add w10, w10, #1
+; CHECK-NEXT: str w10, [x8]
+; CHECK-NEXT: ldr w8, [x9]
+; CHECK-NEXT: add w8, w8, #1
+; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: ldr w10, [x8]
+; CHECK-NEXT: add w10, w10, #1
+; CHECK-NEXT: str w10, [x8]
+; CHECK-NEXT: ldr w8, [x9]
+; CHECK-NEXT: add w8, w8, #1
+; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: ret
+
+ %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
new file mode 100644
index 0000000000000..557a42116cdb0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
+
+define void @histogram_i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: histcnt z1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT: mov z3.d, x0
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [z0.d]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: mad z1.d, p1/m, z3.d, z2.d
+; CHECK-NEXT: st1d { z1.d }, p0, [z0.d]
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+;; FIXME: We may need some dagcombines here: we're multiplying the output of the histcnt
+;; by 1, so we should be able to remove that and directly add the histcnt to the
+;; current bucket data.
+define void @histogram_i32_literal(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_literal:
+; CHECK: // %bb.0:
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ret
+
+ %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %indices
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_i32_literal_noscale(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_literal_noscale:
+; CHECK: // %bb.0:
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, sxtw]
+; CHECK-NEXT: ret
+
+ %buckets = getelementptr i8, ptr %base, <vscale x 4 x i32> %indices
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }