[llvm] Add an all-in-one histogram intrinsic, along with lowering for AArch64 (PR #88106)

via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 9 03:40:53 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-ir

Author: Graham Hunter (huntergr-arm)

<details>
<summary>Changes</summary>

Based on discussion from https://discourse.llvm.org/t/rfc-vectorization-support-for-histogram-count-operations/74788

Current interface is:

llvm.experimental.histogram(<vecty> ptrs, <intty> inc_amount, <vecty> mask)

The integer type used by 'inc_amount' needs to match the type of the buckets
in memory.

The intrinsic covers the following operations:
  * Gather load
  * histogram on the elements of 'ptrs'
  * multiply the histogram results by 'inc_amount'
  * add the result of the multiply to the values loaded by the gather
  * scatter store the results of the add

At this stage, we'd mostly like to discuss which type of intrinsic would be preferable (this one, or the one originally proposed in the RFC). Work on the LoopVectorize side to recognize histogram operations will proceed independently of this patch, at least for now.

---
Full diff: https://github.com/llvm/llvm-project/pull/88106.diff


9 Files Affected:

- (modified) llvm/include/llvm/CodeGen/ISDOpcodes.h (+5) 
- (modified) llvm/include/llvm/CodeGen/SelectionDAG.h (+3) 
- (modified) llvm/include/llvm/CodeGen/SelectionDAGNodes.h (+24) 
- (modified) llvm/include/llvm/IR/Intrinsics.td (+7) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+33) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (+50) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp (+2) 
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+54) 
- (added) llvm/test/CodeGen/AArch64/sve2-histcnt.ll (+37) 


``````````diff
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 49d51a27e3c0f6..eed80c4c57b2cd 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1395,6 +1395,11 @@ enum NodeType {
   // which is later translated to an implicit use in the MIR.
   CONVERGENCECTRL_GLUE,
 
+  // Experimental vector histogram intrinsic
+  // Operands: input chain, inc, mask, baseptr, indices, scale
+  // Output: output chain
+  EXPERIMENTAL_HISTOGRAM,
+
   /// BUILTIN_OP_END - This must be the last enum value in this list.
   /// The target-specific pre-isel opcode values start here.
   BUILTIN_OP_END
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index f347131be080f6..3e3fef50bad9ad 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1525,6 +1525,9 @@ class SelectionDAG {
                            ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
                            ISD::MemIndexType IndexType,
                            bool IsTruncating = false);
+  SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl,
+                             ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
+                             ISD::MemIndexType IndexType);
 
   SDValue getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT,
                       MachineMemOperand *MMO);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 261f7e49e5c8ca..f3ff552525dfb8 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -542,6 +542,7 @@ BEGIN_TWO_BYTE_PACK()
     friend class MaskedLoadStoreSDNode;
     friend class MaskedGatherScatterSDNode;
     friend class VPGatherScatterSDNode;
+    friend class MaskedHistogramSDNode;
 
     uint16_t : NumMemSDNodeBits;
 
@@ -564,6 +565,7 @@ BEGIN_TWO_BYTE_PACK()
     friend class MaskedLoadSDNode;
     friend class MaskedGatherSDNode;
     friend class VPGatherSDNode;
+    friend class MaskedHistogramSDNode;
 
     uint16_t : NumLSBaseSDNodeBits;
 
@@ -1411,6 +1413,7 @@ class MemSDNode : public SDNode {
       return getOperand(2);
     case ISD::MGATHER:
     case ISD::MSCATTER:
+    case ISD::EXPERIMENTAL_HISTOGRAM:
       return getOperand(3);
     default:
       return getOperand(1);
@@ -1459,6 +1462,7 @@ class MemSDNode : public SDNode {
     case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
     case ISD::GET_FPENV_MEM:
     case ISD::SET_FPENV_MEM:
+    case ISD::EXPERIMENTAL_HISTOGRAM:
       return true;
     default:
       return N->isMemIntrinsic() || N->isTargetMemoryOpcode();
@@ -2939,6 +2943,26 @@ class MaskedScatterSDNode : public MaskedGatherScatterSDNode {
   }
 };
 
+class MaskedHistogramSDNode : public MemSDNode {
+public:
+  friend class SelectionDAG;
+
+  MaskedHistogramSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
+                        EVT MemVT, MachineMemOperand *MMO,
+                        ISD::MemIndexType IndexType)
+      : MemSDNode(ISD::EXPERIMENTAL_HISTOGRAM, Order, DL, VTs, MemVT, MMO) {
+    LSBaseSDNodeBits.AddressingMode = IndexType;
+  }
+
+  ISD::MemIndexType getIndexType() const {
+    return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
+  }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::EXPERIMENTAL_HISTOGRAM;
+  }
+};
+
 class FPStateAccessSDNode : public MemSDNode {
 public:
   friend class SelectionDAG;
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index f0723a633f0fc5..d4ee7fd1b862a9 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1848,6 +1848,13 @@ def int_experimental_vp_strided_load  : DefaultAttrsIntrinsic<[llvm_anyvector_ty
                                llvm_i32_ty],
                              [ NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>;
 
+// Experimental histogram
+def int_experimental_histogram : DefaultAttrsIntrinsic<[],
+                             [ llvm_anyvector_ty, // Vector of pointers
+                               llvm_anyint_ty,    // Increment
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+                             [ IntrNoSync, IntrWillReturn ]>;
+
 // Operators
 let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
   // Integer arithmetic
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 1dd0fa49a460f8..b7f80a06019407 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9530,6 +9530,39 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl,
   return V;
 }
 
+SDValue SelectionDAG::getMaskedHistogram(SDVTList VTs, EVT MemVT,
+                                         const SDLoc &dl,
+                                         ArrayRef<SDValue> Ops,
+                                         MachineMemOperand *MMO,
+                                         ISD::MemIndexType IndexType) {
+  assert(Ops.size() == 6 && "Incompatible number of operands");
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::EXPERIMENTAL_HISTOGRAM, VTs, Ops);
+  ID.AddInteger(MemVT.getRawBits());
+  ID.AddInteger(getSyntheticNodeSubclassData<MaskedHistogramSDNode>(
+      dl.getIROrder(), VTs, MemVT, MMO, IndexType));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  ID.AddInteger(MMO->getFlags());
+  void *IP = nullptr;
+  if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+    cast<MaskedGatherSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+
+  auto *N = newSDNode<MaskedHistogramSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+                                             VTs, MemVT, MMO, IndexType);
+  createOperands(N, Ops);
+
+  // FIXME: assert conditions on operands. The CSE-hit path above casts to MaskedGatherSDNode, not MaskedHistogramSDNode.
+
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
+}
+
 SDValue SelectionDAG::getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr,
                                   EVT MemVT, MachineMemOperand *MMO) {
   assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 4ba27157ec1c6e..235c79384f412f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7940,6 +7940,56 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::experimental_convergence_entry:
   case Intrinsic::experimental_convergence_loop:
     visitConvergenceControl(I, Intrinsic);
+    return;
+  case Intrinsic::experimental_histogram: {
+    // FIXME: Move this to a separate function.
+    Value *Ptr = I.getOperand(0);
+    SDValue Inc = getValue(I.getOperand(1));
+    SDValue Mask = getValue(I.getOperand(2));
+
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    DataLayout TargetDL = DAG.getDataLayout();
+    EVT VT = Inc.getValueType();
+    Align Alignment = DAG.getEVTAlign(VT);
+
+    const MDNode *Ranges = getRangeMetadata(I);
+
+    SDValue Root = DAG.getRoot();
+    SDValue Base;
+    SDValue Index;
+    ISD::MemIndexType IndexType;
+    SDValue Scale;
+    bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+                                      I.getParent(), VT.getScalarStoreSize());
+
+    unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
+
+    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+      MachinePointerInfo(AS),
+      MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
+      MemoryLocation::UnknownSize, Alignment, I.getAAMetadata(), Ranges);
+
+    if (!UniformBase) {
+      Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+      Index = getValue(Ptr);
+      IndexType = ISD::SIGNED_SCALED;
+      Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+    }
+
+    EVT IdxVT = Index.getValueType();
+    EVT EltTy = IdxVT.getVectorElementType();
+    if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
+      EVT NewIdxVT = IdxVT.changeVectorElementType(EltTy);
+      Index = DAG.getNode(ISD::SIGN_EXTEND, sdl, NewIdxVT, Index);
+    }
+
+    SDValue Ops[] = { Root, Inc, Mask, Base, Index, Scale };
+    SDValue Histogram = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), VT,
+                                               sdl, Ops, MMO, IndexType);
+
+    setValue(&I, Histogram);
+    DAG.setRoot(Histogram);
+  }
   }
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 6691aa41face39..5c8d2fc9218b16 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -528,6 +528,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::PATCHPOINT:
     return "patchpoint";
 
+  case ISD::EXPERIMENTAL_HISTOGRAM:     return "histogram";
+
     // Vector Predication
 #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...)                    \
   case ISD::SDID:                                                              \
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 819e8ccd5c33f0..914b3cb861e49f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1603,6 +1603,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
         setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
     }
 
+    // Histcnt is SVE2 only
+    if (Subtarget->hasSVE2() && Subtarget->isSVEAvailable())
+      setOperationAction(ISD::EXPERIMENTAL_HISTOGRAM, MVT::Other, Custom);
+
     if (!Subtarget->isNeonAvailable()) {
       setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Custom);
       setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Custom);
@@ -6673,6 +6677,56 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFunnelShift(Op, DAG);
   case ISD::FLDEXP:
     return LowerFLDEXP(Op, DAG);
+  case ISD::EXPERIMENTAL_HISTOGRAM: {
+    // FIXME: Move to another function.
+    // FIXME: Maybe share some code with LowerMGather/Scatter?
+    MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
+    SDLoc DL(HG);
+    SDValue Chain = HG->getOperand(0);
+    SDValue Inc = HG->getOperand(1);
+    SDValue Mask = HG->getOperand(2);
+    SDValue Ptr = HG->getOperand(3);
+    SDValue Index = HG->getOperand(4);
+    SDValue Scale = HG->getOperand(5);
+
+    EVT IncVT = Inc.getValueType();
+    EVT IndexVT = Index.getValueType();
+    EVT MemVT = EVT::getVectorVT(*DAG.getContext(), IncVT,
+                                 IndexVT.getVectorElementCount());
+    SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+    SDValue PassThru = DAG.getSplatVector(MemVT, DL, Zero);
+    SDValue IncSplat = DAG.getSplatVector(MemVT, DL, Inc);
+    SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
+
+    // Intended: make the MMO load-only. NOTE(review): confirm setFlags() actually clears MOStore.
+    MachineMemOperand *GMMO = HG->getMemOperand();
+    GMMO->setFlags(MachineMemOperand::MOLoad);
+    ISD::MemIndexType IndexType = HG->getIndexType();
+    SDValue Gather =
+                DAG.getMaskedGather(DAG.getVTList(MemVT, MVT::Other), MemVT, DL,
+                                    Ops, HG->getMemOperand(),
+                                    IndexType, ISD::NON_EXTLOAD);
+
+    SDValue GChain = Gather.getValue(1);
+
+    // Perform the histcnt, multiply by inc, add to bucket data.
+    SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncVT);
+    SDValue HistCnt = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask,
+                                  Index, Index);
+    SDValue Mul = DAG.getNode(ISD::MUL, DL, MemVT, HistCnt, IncSplat);
+    SDValue Add = DAG.getNode(ISD::ADD, DL, MemVT, Gather, Mul);
+
+
+    // Create a new MMO for the scatter.
+    MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
+        GMMO->getPointerInfo(), MachineMemOperand::MOStore,
+        GMMO->getSize(), GMMO->getAlign(), GMMO->getAAInfo());
+
+    SDValue ScatterOps[] = { GChain, Add, Mask, Ptr, Index, Scale };
+    SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
+                                           ScatterOps, SMMO, IndexType, false);
+    return Scatter;
+  }
   }
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
new file mode 100644
index 00000000000000..5a1b771caf09d4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
+
+define void @histogram_i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    histcnt z1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    mov z3.d, x0
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
+; CHECK-NEXT:    st1d { z1.d }, p0, [z0.d]
+; CHECK-NEXT:    ret
+  call void @llvm.experimental.histogram.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask)
+  ret void
+}
+
;; FIXME: We may need some DAG combines here. We're multiplying the output of the histcnt
+;;        by 1, so we should be able to remove that and directly add the histcnt to the
+;;        current bucket data.
+define void @histogram_i32_literal(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_literal:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z3.s, #1 // =0x1
+; CHECK-NEXT:    histcnt z2.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mla z1.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    ret
+
+  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %indices
+  call void @llvm.experimental.histogram.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }

``````````

</details>


https://github.com/llvm/llvm-project/pull/88106


More information about the llvm-commits mailing list