[llvm] [AArch64][SelectionDAG] Implement vector splitting for histogram intrinsic (PR #103037)

Wed Aug 28 10:39:40 PDT 2024

https://github.com/DevM-uk updated https://github.com/llvm/llvm-project/pull/103037

>From 79ce338d7af82fef343e4aa6f215b89d1b6cf305 Mon Sep 17 00:00:00 2001
From: Max Beck-Jones <max.beck-jones at arm.com>
Date: Mon, 12 Aug 2024 13:48:31 +0000
Subject: [PATCH 1/4] [AArch64] Implement vector splitting for histogram
 intrinsic

Adds support for wider-than-legal vector types for the histogram intrinsic (llvm.experimental.vector.histogram.add) by splitting the vector.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 39 ++++++++
 llvm/test/CodeGen/AArch64/sve2-histcnt.ll     | 95 +++++++++++++++++++
 2 files changed, 134 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7777aa4b50a370..7c9b34b272f17d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1128,6 +1128,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
 
+  setTargetDAGCombine(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM);
+
   // In case of strict alignment, avoid an excessive number of byte wide stores.
   MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemset =
@@ -25434,6 +25436,41 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   return NVCAST;
 }
 
+static SDValue performHistogramCombine(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       SelectionDAG &DAG) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(N);
+  SDLoc DL(HG);
+  SDValue Chain = HG->getChain();
+  SDValue Inc = HG->getInc();
+  SDValue Mask = HG->getMask();
+  SDValue Ptr = HG->getBasePtr();
+  SDValue Index = HG->getIndex();
+  SDValue Scale = HG->getScale();
+  SDValue IntID = HG->getIntID();
+  EVT MemVT = HG->getMemoryVT();
+  EVT IndexVT = Index.getValueType();
+  MachineMemOperand *MMO = HG->getMemOperand();
+  ISD::MemIndexType IndexType = HG->getIndexType();
+
+  if (IndexVT == MVT::nxv4i32 || IndexVT == MVT::nxv2i64)
+    return SDValue();
+
+  // Split vectors which are too wide
+  SDValue IndexLo, IndexHi, MaskLo, MaskHi;
+  std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
+  std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+  SDValue HistogramOpsLo[] = {Chain, Inc, MaskLo, Ptr, IndexLo, Scale, IntID};
+  SDValue HChain = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
+                                          HistogramOpsLo, MMO, IndexType);
+  SDValue HistogramOpsHi[] = {HChain, Inc, MaskHi, Ptr, IndexHi, Scale, IntID};
+  return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
+                                HistogramOpsHi, MMO, IndexType);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -25778,6 +25815,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performCTLZCombine(N, DAG, Subtarget);
   case ISD::SCALAR_TO_VECTOR:
     return performScalarToVectorCombine(N, DCI, DAG);
+  case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
+    return performHistogramCombine(N, DCI, DAG);
   }
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
index 2874e47511e12f..56d5eb13ab12e3 100644
--- a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
@@ -169,4 +169,99 @@ define void @histogram_i16_literal_3(ptr %base, <vscale x 4 x i32> %indices, <vs
   ret void
 }
 
+define void @histogram_i64_4_lane(<vscale x 4 x ptr> %buckets, i64 %inc, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i64_4_lane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    mov z4.d, x0
+; CHECK-NEXT:    ptrue p2.d
+; CHECK-NEXT:    histcnt z2.d, p1/z, z0.d, z0.d
+; CHECK-NEXT:    ld1d { z3.d }, p1/z, [z0.d]
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    mad z2.d, p2/m, z4.d, z3.d
+; CHECK-NEXT:    st1d { z2.d }, p1, [z0.d]
+; CHECK-NEXT:    histcnt z0.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [z1.d]
+; CHECK-NEXT:    mad z0.d, p2/m, z4.d, z2.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 %inc, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_i64_8_lane(<vscale x 8 x ptr> %buckets, i64 %inc, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i64_8_lane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    mov z6.d, x0
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    punpklo p3.h, p2.b
+; CHECK-NEXT:    punpkhi p2.h, p2.b
+; CHECK-NEXT:    histcnt z4.d, p3/z, z0.d, z0.d
+; CHECK-NEXT:    ld1d { z5.d }, p3/z, [z0.d]
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    mad z4.d, p1/m, z6.d, z5.d
+; CHECK-NEXT:    st1d { z4.d }, p3, [z0.d]
+; CHECK-NEXT:    histcnt z0.d, p2/z, z1.d, z1.d
+; CHECK-NEXT:    ld1d { z4.d }, p2/z, [z1.d]
+; CHECK-NEXT:    mad z0.d, p1/m, z6.d, z4.d
+; CHECK-NEXT:    st1d { z0.d }, p2, [z1.d]
+; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    histcnt z0.d, p2/z, z2.d, z2.d
+; CHECK-NEXT:    ld1d { z1.d }, p2/z, [z2.d]
+; CHECK-NEXT:    mad z0.d, p1/m, z6.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p2, [z2.d]
+; CHECK-NEXT:    histcnt z0.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [z3.d]
+; CHECK-NEXT:    mad z0.d, p1/m, z6.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [z3.d]
+; CHECK-NEXT:    ret
+  call void @llvm.experimental.vector.histogram.add.nxv8p0.i64(<vscale x 8 x ptr> %buckets, i64 %inc, <vscale x 8 x i1> %mask)
+  ret void
+}
+
+define void @histogram_i32_8_lane(ptr %base, <vscale x 8 x i32> %indices, i32 %inc, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_8_lane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    mov z4.s, w1
+; CHECK-NEXT:    ptrue p2.s
+; CHECK-NEXT:    histcnt z2.s, p1/z, z0.s, z0.s
+; CHECK-NEXT:    ld1w { z3.s }, p1/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    mad z2.s, p2/m, z4.s, z3.s
+; CHECK-NEXT:    st1w { z2.s }, p1, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    histcnt z0.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z1.s, sxtw #2]
+; CHECK-NEXT:    mad z0.s, p2/m, z4.s, z2.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
+; CHECK-NEXT:    ret
+  %buckets = getelementptr i32, ptr %base, <vscale x 8 x i32> %indices
+  call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 %inc, <vscale x 8 x i1> %mask)
+  ret void
+}
+
+define void @histogram_i16_8_lane(ptr %base, <vscale x 8 x i32> %indices, i16 %inc, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i16_8_lane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    mov z4.s, w1
+; CHECK-NEXT:    ptrue p2.s
+; CHECK-NEXT:    histcnt z2.s, p1/z, z0.s, z0.s
+; CHECK-NEXT:    ld1h { z3.s }, p1/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    mad z2.s, p2/m, z4.s, z3.s
+; CHECK-NEXT:    st1h { z2.s }, p1, [x0, z0.s, sxtw #1]
+; CHECK-NEXT:    histcnt z0.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z1.s, sxtw #1]
+; CHECK-NEXT:    mad z0.s, p2/m, z4.s, z2.s
+; CHECK-NEXT:    st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
+; CHECK-NEXT:    ret
+  %buckets = getelementptr i16, ptr %base, <vscale x 8 x i32> %indices
+  call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 %inc, <vscale x 8 x i1> %mask)
+  ret void
+}
+
+
 attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }

>From 577383ce209661bb4402bff524083513eb75d01f Mon Sep 17 00:00:00 2001
From: Max Beck-Jones <max.beck-jones at arm.com>
Date: Tue, 20 Aug 2024 11:37:31 +0000
Subject: [PATCH 2/4] Move legalization to legalizer

Moves the splitting from a target specific DAG combine to LegalizeVectorTypes and adds promotion to LegalizeIntegerTypes. Also updates how the histogram intrinsic is custom lowered during legalize ops.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  5 ++
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 11 +++++
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |  2 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 28 +++++++++++
 .../Target/AArch64/AArch64ISelLowering.cpp    | 49 +++----------------
 llvm/test/CodeGen/AArch64/sve2-histcnt.ll     |  8 ++-
 6 files changed, 58 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index f4b3d1a41c681b..3c94f01da0be43 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1241,6 +1241,11 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     Action = TLI.getOperationAction(Node->getOpcode(),
                                     Node->getOperand(0).getValueType());
     break;
+  case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
+    Action = TLI.getOperationAction(
+        Node->getOpcode(),
+        cast<MaskedHistogramSDNode>(Node)->getIndex().getValueType());
+    break;
   default:
     if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
       Action = TLI.getCustomOperationAction(*Node);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 3635bc7a965804..98d6d8b1fc94b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2037,6 +2037,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
   case ISD::EXPERIMENTAL_VP_SPLICE:
     Res = PromoteIntOp_VP_SPLICE(N, OpNo);
     break;
+  case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
+    Res = PromoteIntOp_VECTOR_HISTOGRAM(N, OpNo);
+    break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -2749,6 +2752,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo) {
   return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N,
+                                                        unsigned OpNo) {
+  assert(OpNo == 1 && "Unexpected operand for promotion");
+  SmallVector<SDValue, 7> NewOps(N->ops());
+  NewOps[1] = GetPromotedInteger(N->getOperand(1));
+  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
 //===----------------------------------------------------------------------===//
 //  Integer Result Expansion
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 3a49a8ff10860a..af763b6159aee7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -424,6 +424,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo);
 
   void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS);
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
@@ -972,6 +973,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SplitVecOp_CMP(SDNode *N);
   SDValue SplitVecOp_FP_TO_XINT_SAT(SDNode *N);
   SDValue SplitVecOp_VP_CttzElements(SDNode *N);
+  SDValue SplitVecOp_VECTOR_HISTOGRAM(SDNode *N);
 
   //===--------------------------------------------------------------------===//
   // Vector Widening Support: LegalizeVectorTypes.cpp
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5a21ad7ac7e2cd..c3388b895f2e8d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3264,6 +3264,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
     Res = SplitVecOp_VP_CttzElements(N);
     break;
+  case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
+    Res = SplitVecOp_VECTOR_HISTOGRAM(N);
+    break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -4274,6 +4277,31 @@ SDValue DAGTypeLegalizer::SplitVecOp_VP_CttzElements(SDNode *N) {
                        DAG.getNode(ISD::ADD, DL, ResVT, VLo, ResHi));
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) {
+  MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(N);
+  SDLoc DL(HG);
+  SDValue Chain = HG->getChain();
+  SDValue Inc = HG->getInc();
+  SDValue Mask = HG->getMask();
+  SDValue Ptr = HG->getBasePtr();
+  SDValue Index = HG->getIndex();
+  SDValue Scale = HG->getScale();
+  SDValue IntID = HG->getIntID();
+  EVT MemVT = HG->getMemoryVT();
+  MachineMemOperand *MMO = HG->getMemOperand();
+  ISD::MemIndexType IndexType = HG->getIndexType();
+
+  SDValue IndexLo, IndexHi, MaskLo, MaskHi;
+  std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
+  std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+  SDValue HistogramOpsLo[] = {Chain, Inc, MaskLo, Ptr, IndexLo, Scale, IntID};
+  SDValue HChain = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
+                                          HistogramOpsLo, MMO, IndexType);
+  SDValue HistogramOpsHi[] = {HChain, Inc, MaskHi, Ptr, IndexHi, Scale, IntID};
+  return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
+                                HistogramOpsHi, MMO, IndexType);
+}
+
 //===----------------------------------------------------------------------===//
 //  Result Vector Widening
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7c9b34b272f17d..5af07af38bdb7b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1128,8 +1128,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
 
-  setTargetDAGCombine(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM);
-
   // In case of strict alignment, avoid an excessive number of byte wide stores.
   MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemset =
@@ -1779,10 +1777,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
     // Histcnt is SVE2 only
     if (Subtarget->hasSVE2()) {
-      setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other,
+      setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::i8, Promote);
+      setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::i16, Promote);
+      setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv4i32,
+                         Custom);
+      setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
                          Custom);
-      setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::i8, Custom);
-      setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::i16, Custom);
     }
   }
 
@@ -25436,41 +25436,6 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   return NVCAST;
 }
 
-static SDValue performHistogramCombine(SDNode *N,
-                                       TargetLowering::DAGCombinerInfo &DCI,
-                                       SelectionDAG &DAG) {
-  if (!DCI.isBeforeLegalize())
-    return SDValue();
-
-  MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(N);
-  SDLoc DL(HG);
-  SDValue Chain = HG->getChain();
-  SDValue Inc = HG->getInc();
-  SDValue Mask = HG->getMask();
-  SDValue Ptr = HG->getBasePtr();
-  SDValue Index = HG->getIndex();
-  SDValue Scale = HG->getScale();
-  SDValue IntID = HG->getIntID();
-  EVT MemVT = HG->getMemoryVT();
-  EVT IndexVT = Index.getValueType();
-  MachineMemOperand *MMO = HG->getMemOperand();
-  ISD::MemIndexType IndexType = HG->getIndexType();
-
-  if (IndexVT == MVT::nxv4i32 || IndexVT == MVT::nxv2i64)
-    return SDValue();
-
-  // Split vectors which are too wide
-  SDValue IndexLo, IndexHi, MaskLo, MaskHi;
-  std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
-  std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
-  SDValue HistogramOpsLo[] = {Chain, Inc, MaskLo, Ptr, IndexLo, Scale, IntID};
-  SDValue HChain = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
-                                          HistogramOpsLo, MMO, IndexType);
-  SDValue HistogramOpsHi[] = {HChain, Inc, MaskHi, Ptr, IndexHi, Scale, IntID};
-  return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
-                                HistogramOpsHi, MMO, IndexType);
-}
-
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -25815,8 +25780,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performCTLZCombine(N, DAG, Subtarget);
   case ISD::SCALAR_TO_VECTOR:
     return performScalarToVectorCombine(N, DCI, DAG);
-  case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
-    return performHistogramCombine(N, DCI, DAG);
   }
   return SDValue();
 }
@@ -28219,7 +28182,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
   EVT IndexVT = Index.getValueType();
   LLVMContext &Ctx = *DAG.getContext();
   ElementCount EC = IndexVT.getVectorElementCount();
-  EVT MemVT = EVT::getVectorVT(Ctx, IncVT, EC);
+  EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
   EVT IncExtVT =
       EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
   EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
diff --git a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
index 56d5eb13ab12e3..dd0b9639a8fc2f 100644
--- a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
@@ -132,8 +132,10 @@ define void @histogram_i16_literal_1(ptr %base, <vscale x 4 x i32> %indices, <vs
 ; CHECK-LABEL: histogram_i16_literal_1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mov z3.s, #1 // =0x1
 ; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, sxtw #1]
-; CHECK-NEXT:    add z1.s, z2.s, z1.s
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
 ; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, sxtw #1]
 ; CHECK-NEXT:    ret
   %buckets = getelementptr i16, ptr %base, <vscale x 4 x i32> %indices
@@ -145,8 +147,10 @@ define void @histogram_i16_literal_2(ptr %base, <vscale x 4 x i32> %indices, <vs
 ; CHECK-LABEL: histogram_i16_literal_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mov z3.s, #2 // =0x2
 ; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, sxtw #1]
-; CHECK-NEXT:    adr z1.s, [z2.s, z1.s, lsl #1]
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
 ; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, sxtw #1]
 ; CHECK-NEXT:    ret
   %buckets = getelementptr i16, ptr %base, <vscale x 4 x i32> %indices

>From e703b0cec1a6b04aa90bfbf22da61b5cd6cfc2c8 Mon Sep 17 00:00:00 2001
From: Max Beck-Jones <max.beck-jones at arm.com>
Date: Wed, 28 Aug 2024 16:41:29 +0000
Subject: [PATCH 3/4] fixup: Remove unnecessary setOperationActions

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5af07af38bdb7b..2e36d7df061c47 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1777,8 +1777,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
     // Histcnt is SVE2 only
     if (Subtarget->hasSVE2()) {
-      setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::i8, Promote);
-      setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::i16, Promote);
       setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv4i32,
                          Custom);
       setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,

>From a914c3c7b3e508b861d30e72b3954688fbd53124 Mon Sep 17 00:00:00 2001
From: Max Beck-Jones <max.beck-jones at arm.com>
Date: Wed, 28 Aug 2024 17:27:22 +0000
Subject: [PATCH 4/4] fixup: Review nits

---
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index c3388b895f2e8d..10c186b1ac2ee8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4280,11 +4280,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_VP_CttzElements(SDNode *N) {
 SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) {
   MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(N);
   SDLoc DL(HG);
-  SDValue Chain = HG->getChain();
   SDValue Inc = HG->getInc();
-  SDValue Mask = HG->getMask();
   SDValue Ptr = HG->getBasePtr();
-  SDValue Index = HG->getIndex();
   SDValue Scale = HG->getScale();
   SDValue IntID = HG->getIntID();
   EVT MemVT = HG->getMemoryVT();
@@ -4292,14 +4289,14 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) {
   ISD::MemIndexType IndexType = HG->getIndexType();
 
   SDValue IndexLo, IndexHi, MaskLo, MaskHi;
-  std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
-  std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
-  SDValue HistogramOpsLo[] = {Chain, Inc, MaskLo, Ptr, IndexLo, Scale, IntID};
-  SDValue HChain = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
-                                          HistogramOpsLo, MMO, IndexType);
-  SDValue HistogramOpsHi[] = {HChain, Inc, MaskHi, Ptr, IndexHi, Scale, IntID};
-  return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
-                                HistogramOpsHi, MMO, IndexType);
+  std::tie(IndexLo, IndexHi) = DAG.SplitVector(HG->getIndex(), DL);
+  std::tie(MaskLo, MaskHi) = DAG.SplitVector(HG->getMask(), DL);
+  SDValue OpsLo[] = {HG->getChain(), Inc, MaskLo, Ptr, IndexLo, Scale, IntID};
+  SDValue Lo = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
+                                      OpsLo, MMO, IndexType);
+  SDValue OpsHi[] = {Lo, Inc, MaskHi, Ptr, IndexHi, Scale, IntID};
+  return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, OpsHi,
+                                MMO, IndexType);
 }
 
 //===----------------------------------------------------------------------===//