[llvm] [AArch64] Fix codegen for histograms with i64 increments (PR #181808)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 17 04:03:22 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Sander de Smalen (sdesmalen-arm)
<details>
<summary>Changes</summary>
Histograms don't do any legalisation on the loaded data type, so if the 'add' would need to be performed on a vector of i64s, then we can't use the more optimal addressing with i32 offsets, as that would return a vector of nxv4i32, which wouldn't get widened.
This fixes https://github.com/llvm/llvm-project/issues/181764
---
Full diff: https://github.com/llvm/llvm-project/pull/181808.diff
3 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (+6-2)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+14-3)
- (modified) llvm/test/CodeGen/AArch64/sve2-histcnt.ll (+50)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 7c762ed6d91ce..fc907243df80c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6555,8 +6555,12 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
}
EVT IdxVT = Index.getValueType();
- EVT EltTy = IdxVT.getVectorElementType();
- if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
+
+ // Avoid using nxv4i32 as index type when the increment must be performed
+ // on i64's.
+ bool MustExtendIndex = VT == MVT::i64 && IdxVT.getScalarSizeInBits() < 64;
+ EVT EltTy = MustExtendIndex ? VT : IdxVT.getVectorElementType();
+ if (TLI.shouldExtendGSIndex(IdxVT, EltTy) || MustExtendIndex) {
EVT NewIdxVT = IdxVT.changeVectorElementType(*DAG.getContext(), EltTy);
Index = DAG.getNode(ISD::SIGN_EXTEND, sdl, NewIdxVT, Index);
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 66c22db0491d1..ba8125412c44d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26106,14 +26106,14 @@ static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
Changed = true;
+ EVT IndexVT = Index.getValueType();
+ EVT DataVT = N->getOperand(1).getValueType();
+
// Only consider element types that are pointer sized as smaller types can
// be easily promoted.
- EVT IndexVT = Index.getValueType();
if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
return Changed;
- // Can indices be trivially shrunk?
- EVT DataVT = N->getOperand(1).getValueType();
// Don't attempt to shrink the index for fixed vectors of 64 bit data since it
// will later be re-extended to 64 bits in legalization
if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
@@ -26202,6 +26202,17 @@ static SDValue performMaskedGatherScatterCombine(
MSC->isTruncatingStore());
}
auto *HG = cast<MaskedHistogramSDNode>(MGS);
+
+ // Histograms don't do any legalisation on the loaded data type,
+ // so if the 'add' would need to be performed on a vector of i64's, then
+ // we can't use the more optimal addressing with i32 offsets as that
+ // would return a vector of nxv4i32, which wouldn't get widened.
+ if (HG->getInc().getValueType().getScalarType() == MVT::i64 &&
+ Index.getValueType().getScalarType() == MVT::i32)
+ // FIXME: If the increment value is a constant or extended value,
+ // we can truncate the increment value.
+ return SDValue();
+
SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
Index, Scale, HG->getIntID()};
return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
diff --git a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
index 06cd65620d1c9..f8e190a29e051 100644
--- a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
@@ -496,4 +496,54 @@ define void @histogram_sext_zero_mask(ptr %base, <vscale x 4 x i32> %indices, <v
ret void
}
+; Test that we don't use nxv4i32 offsets when the increment must be performed as i64's.
+define void @histogram_dont_optimize_index(ptr %p, i64 %inc, <vscale x 4 x i1> %mask, <vscale x 4 x i8> %offsets) #0 {
+; CHECK-LABEL: histogram_dont_optimize_index:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: mov z4.d, x1
+; CHECK-NEXT: ptrue p2.d
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: histcnt z2.d, p1/z, z1.d, z1.d
+; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0, z1.d, lsl #3]
+; CHECK-NEXT: mad z2.d, p2/m, z4.d, z3.d
+; CHECK-NEXT: st1d { z2.d }, p1, [x0, z1.d, lsl #3]
+; CHECK-NEXT: histcnt z1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: mad z1.d, p2/m, z4.d, z2.d
+; CHECK-NEXT: st1d { z1.d }, p0, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+ %4 = zext <vscale x 4 x i8> %offsets to <vscale x 4 x i64>
+ %5 = getelementptr i64, ptr %p, <vscale x 4 x i64> %4
+ call void @llvm.experimental.vector.histogram.add.nxv16p0.i64(<vscale x 4 x ptr> %5, i64 %inc, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+; Test that sign-extended i32 offsets are likewise not narrowed to nxv4i32 when the increment must be performed as i64's.
+define void @histogram_sign_extend_index(ptr %p, i64 %inc, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %offsets) #0 {
+; CHECK-LABEL: histogram_sign_extend_index:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpklo z1.d, z0.s
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: mov z4.d, x1
+; CHECK-NEXT: ptrue p2.d
+; CHECK-NEXT: sunpkhi z0.d, z0.s
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: histcnt z2.d, p1/z, z1.d, z1.d
+; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0, z1.d, lsl #3]
+; CHECK-NEXT: mad z2.d, p2/m, z4.d, z3.d
+; CHECK-NEXT: st1d { z2.d }, p1, [x0, z1.d, lsl #3]
+; CHECK-NEXT: histcnt z1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: mad z1.d, p2/m, z4.d, z2.d
+; CHECK-NEXT: st1d { z1.d }, p0, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+ %5 = getelementptr i64, ptr %p, <vscale x 4 x i32> %offsets
+ call void @llvm.experimental.vector.histogram.add.nxv16p0.i64(<vscale x 4 x ptr> %5, i64 %inc, <vscale x 4 x i1> %mask)
+ ret void
+}
+
attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }
``````````
</details>
https://github.com/llvm/llvm-project/pull/181808
More information about the llvm-commits
mailing list