[llvm] 11c8188 - [AArch64] Improve index selection for histograms (#111150)
James Chesterman via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 22 03:14:06 PDT 2024
Author: James Chesterman
Date: 2024-10-22T11:14:00+01:00
New Revision: 11c818816d0558408eb966238bd9df5f54ac5fd0
URL: https://github.com/llvm/llvm-project/commit/11c818816d0558408eb966238bd9df5f54ac5fd0
DIFF: https://github.com/llvm/llvm-project/commit/11c818816d0558408eb966238bd9df5f54ac5fd0.diff
LOG: [AArch64] Improve index selection for histograms (#111150)
Removes unnecessary extends on the indices passed into histogram instructions, and removes the histogram instruction entirely when its mask is known to be all zeroes.
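For reference, a minimal sketch of the kind of IR this combine targets, adapted from the histogram_i32_zext test added to sve2-histcnt.ll below (the function name here is only illustrative). With this patch the zext of the i32 indices no longer needs a widened index vector; the extend is folded into the addressing mode of the lowered histogram sequence (the uxtw forms in the CHECK lines below):

define void @histogram_zext_sketch(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) {
  ; 32-bit indices, zero-extended to i64 before forming the bucket pointers
  %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
  ; histogram update; the combine narrows the index back to i32 when legal
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
  ret void
}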
Added:
Modified:
llvm/include/llvm/CodeGen/SelectionDAGNodes.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve2-histcnt.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 639e9311977502..bda0120a2df4aa 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -2938,8 +2938,8 @@ class MaskedGatherScatterSDNode : public MemSDNode {
const SDValue &getScale() const { return getOperand(5); }
static bool classof(const SDNode *N) {
- return N->getOpcode() == ISD::MGATHER ||
- N->getOpcode() == ISD::MSCATTER;
+ return N->getOpcode() == ISD::MGATHER || N->getOpcode() == ISD::MSCATTER ||
+ N->getOpcode() == ISD::EXPERIMENTAL_VECTOR_HISTOGRAM;
}
};
@@ -2994,17 +2994,15 @@ class MaskedScatterSDNode : public MaskedGatherScatterSDNode {
}
};
-class MaskedHistogramSDNode : public MemSDNode {
+class MaskedHistogramSDNode : public MaskedGatherScatterSDNode {
public:
friend class SelectionDAG;
MaskedHistogramSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
EVT MemVT, MachineMemOperand *MMO,
ISD::MemIndexType IndexType)
- : MemSDNode(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, Order, DL, VTs, MemVT,
- MMO) {
- LSBaseSDNodeBits.AddressingMode = IndexType;
- }
+ : MaskedGatherScatterSDNode(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, Order, DL,
+ VTs, MemVT, MMO, IndexType) {}
ISD::MemIndexType getIndexType() const {
return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 50a75bc5932c42..ad2d2ede302af8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -549,6 +549,7 @@ namespace {
SDValue visitMSTORE(SDNode *N);
SDValue visitMGATHER(SDNode *N);
SDValue visitMSCATTER(SDNode *N);
+ SDValue visitMHISTOGRAM(SDNode *N);
SDValue visitVPGATHER(SDNode *N);
SDValue visitVPSCATTER(SDNode *N);
SDValue visitVP_STRIDED_LOAD(SDNode *N);
@@ -1972,6 +1973,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::MLOAD: return visitMLOAD(N);
case ISD::MSCATTER: return visitMSCATTER(N);
case ISD::MSTORE: return visitMSTORE(N);
+ case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return visitMHISTOGRAM(N);
case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N);
case ISD::LIFETIME_END: return visitLIFETIME_END(N);
case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
@@ -12357,6 +12359,35 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
+ MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(N);
+ SDValue Chain = HG->getChain();
+ SDValue Inc = HG->getInc();
+ SDValue Mask = HG->getMask();
+ SDValue BasePtr = HG->getBasePtr();
+ SDValue Index = HG->getIndex();
+ SDLoc DL(HG);
+
+ EVT MemVT = HG->getMemoryVT();
+ MachineMemOperand *MMO = HG->getMemOperand();
+ ISD::MemIndexType IndexType = HG->getIndexType();
+
+ if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
+ return Chain;
+
+ SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index,
+ HG->getScale(), HG->getIntID()};
+ if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL))
+ return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
+ MMO, IndexType);
+
+ EVT DataVT = Index.getValueType();
+ if (refineIndexType(Index, IndexType, DataVT, DAG))
+ return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
+ MMO, IndexType);
+ return SDValue();
+}
+
SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) {
auto *SLD = cast<VPStridedLoadSDNode>(N);
EVT EltVT = SLD->getValueType(0).getVectorElementType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4aa123b42d1966..018d48b2c8f12e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1122,7 +1122,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
- setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});
+ setTargetDAGCombine(
+ {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
setTargetDAGCombine(ISD::FP_EXTEND);
@@ -23821,11 +23822,9 @@ static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
static SDValue performMaskedGatherScatterCombine(
SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
- MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
- assert(MGS && "Can only combine gather load or scatter store nodes");
-
if (!DCI.isBeforeLegalize())
return SDValue();
+ MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
SDLoc DL(MGS);
SDValue Chain = MGS->getChain();
@@ -23847,12 +23846,18 @@ static SDValue performMaskedGatherScatterCombine(
DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
}
- auto *MSC = cast<MaskedScatterSDNode>(MGS);
- SDValue Data = MSC->getValue();
- SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
- return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
- Ops, MSC->getMemOperand(), IndexType,
- MSC->isTruncatingStore());
+ if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
+ SDValue Data = MSC->getValue();
+ SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
+ DL, Ops, MSC->getMemOperand(), IndexType,
+ MSC->isTruncatingStore());
+ }
+ auto *HG = cast<MaskedHistogramSDNode>(MGS);
+ SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
+ Index, Scale, HG->getIntID()};
+ return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
+ DL, Ops, HG->getMemOperand(), IndexType);
}
/// Target-specific DAG combine function for NEON load/store intrinsics
@@ -26019,6 +26024,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performMSTORECombine(N, DCI, DAG, Subtarget);
case ISD::MGATHER:
case ISD::MSCATTER:
+ case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
return performMaskedGatherScatterCombine(N, DCI, DAG);
case ISD::FP_EXTEND:
return performFPExtendCombine(N, DAG, DCI, Subtarget);
diff --git a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
index dd0b9639a8fc2f..06cd65620d1c9e 100644
--- a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
@@ -267,5 +267,233 @@ define void @histogram_i16_8_lane(ptr %base, <vscale x 8 x i32> %indices, i16 %i
ret void
}
+define void @histogram_i8_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i8 %inc) #0{
+; CHECK-LABEL: histogram_i8_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: mov z3.s, w1
+; CHECK-NEXT: ld1b { z2.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1b { z1.s }, p0, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+ %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
+ %buckets = getelementptr i8, ptr %base, <vscale x 4 x i64> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i8(<vscale x 4 x ptr> %buckets, i8 %inc, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_i16_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i16 %inc) #0{
+; CHECK-LABEL: histogram_i16_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: mov z3.s, w1
+; CHECK-NEXT: ld1h { z2.s }, p0/z, [x0, z0.s, uxtw #1]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1h { z1.s }, p0, [x0, z0.s, uxtw #1]
+; CHECK-NEXT: ret
+ %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
+ %buckets = getelementptr i16, ptr %base, <vscale x 4 x i64> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 %inc, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_i32_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
+ %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_i32_sext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ret
+ %extended = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
+ %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_zext_from_i8_to_i64(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask) #0{
+; CHECK-LABEL: histogram_zext_from_i8_to_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ %extended = zext <vscale x 4 x i8> %indices to <vscale x 4 x i64>
+ %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_zext_from_i16_to_i64(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0{
+; CHECK-LABEL: histogram_zext_from_i16_to_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ %extended = zext <vscale x 4 x i16> %indices to <vscale x 4 x i64>
+ %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_sext_from_i16_to_i64(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0{
+; CHECK-LABEL: histogram_sext_from_i16_to_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: sxth z0.s, p1/m, z0.s
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ret
+ %extended = sext <vscale x 4 x i16> %indices to <vscale x 4 x i64>
+ %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_zext_from_i8_to_i32(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask) #0{
+; CHECK-LABEL: histogram_zext_from_i8_to_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ %extended = zext <vscale x 4 x i8> %indices to <vscale x 4 x i32>
+ %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_zext_from_i16_to_i32(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_zext_from_i16_to_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+ %extended = zext <vscale x 4 x i16> %indices to <vscale x 4 x i32>
+ %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_2_lane_zext(ptr %base, <vscale x 2 x i32> %indices, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_2_lane_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: mov z3.d, #1 // =0x1
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1w { z2.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
+; CHECK-NEXT: histcnt z1.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: mad z1.d, p1/m, z3.d, z2.d
+; CHECK-NEXT: st1w { z1.d }, p0, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ %extended = zext <vscale x 2 x i32> %indices to <vscale x 2 x i64>
+ %buckets = getelementptr i32, ptr %base, <vscale x 2 x i64> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv2p0.i32(<vscale x 2 x ptr> %buckets, i32 1, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @histogram_8_lane_zext(ptr %base, <vscale x 8 x i32> %indices, <vscale x 8 x i1> %mask) #0{
+; CHECK-LABEL: histogram_8_lane_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: mov z4.s, #1 // =0x1
+; CHECK-NEXT: ptrue p2.s
+; CHECK-NEXT: histcnt z2.s, p1/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z3.s }, p1/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: mad z2.s, p2/m, z4.s, z3.s
+; CHECK-NEXT: st1w { z2.s }, p1, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: histcnt z0.s, p0/z, z1.s, z1.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z1.s, uxtw #2]
+; CHECK-NEXT: mad z0.s, p2/m, z4.s, z2.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, uxtw #2]
+; CHECK-NEXT: ret
+ %extended = zext <vscale x 8 x i32> %indices to <vscale x 8 x i64>
+ %buckets = getelementptr i32, ptr %base, <vscale x 8 x i64> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 1, <vscale x 8 x i1> %mask)
+ ret void
+}
+
+define void @histogram_8_lane_sext(ptr %base, <vscale x 8 x i32> %indices, <vscale x 8 x i1> %mask) #0{
+; CHECK-LABEL: histogram_8_lane_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: mov z4.s, #1 // =0x1
+; CHECK-NEXT: ptrue p2.s
+; CHECK-NEXT: histcnt z2.s, p1/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z3.s }, p1/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: mad z2.s, p2/m, z4.s, z3.s
+; CHECK-NEXT: st1w { z2.s }, p1, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: histcnt z0.s, p0/z, z1.s, z1.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z1.s, sxtw #2]
+; CHECK-NEXT: mad z0.s, p2/m, z4.s, z2.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
+; CHECK-NEXT: ret
+ %extended = sext <vscale x 8 x i32> %indices to <vscale x 8 x i64>
+ %buckets = getelementptr i32, ptr %base, <vscale x 8 x i64> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 1, <vscale x 8 x i1> %mask)
+ ret void
+}
+
+define void @histogram_zero_mask(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0{
+; CHECK-LABEL: histogram_zero_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> zeroinitializer)
+ ret void
+}
+
+define void @histogram_sext_zero_mask(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0{
+; CHECK-LABEL: histogram_sext_zero_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %extended = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
+ %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> zeroinitializer)
+ ret void
+}
attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }