[llvm] [AArch64] Fix codegen for histograms with i64 increments (PR #181808)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 17 04:03:22 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Sander de Smalen (sdesmalen-arm)
<details>
<summary>Changes</summary>
Histograms don't do any legalisation on the loaded data type, so if the 'add' would need to be performed on a vector of i64s, then we can't use the more optimal addressing with i32 offsets, as that would return a vector of nxv4i32, which wouldn't get widened.
This fixes https://github.com/llvm/llvm-project/issues/181764
---
Full diff: https://github.com/llvm/llvm-project/pull/181808.diff
3 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (+6-2)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+14-3)
- (modified) llvm/test/CodeGen/AArch64/sve2-histcnt.ll (+50)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 7c762ed6d91ce..fc907243df80c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6555,8 +6555,12 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
}
EVT IdxVT = Index.getValueType();
- EVT EltTy = IdxVT.getVectorElementType();
- if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
+
+ // Avoid using nxv4i32 as index type when the increment must be performed
+ // on i64's.
+ bool MustExtendIndex = VT == MVT::i64 && IdxVT.getScalarSizeInBits() < 64;
+ EVT EltTy = MustExtendIndex ? VT : IdxVT.getVectorElementType();
+ if (TLI.shouldExtendGSIndex(IdxVT, EltTy) || MustExtendIndex) {
EVT NewIdxVT = IdxVT.changeVectorElementType(*DAG.getContext(), EltTy);
Index = DAG.getNode(ISD::SIGN_EXTEND, sdl, NewIdxVT, Index);
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 66c22db0491d1..ba8125412c44d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26106,14 +26106,14 @@ static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
Changed = true;
+ EVT IndexVT = Index.getValueType();
+ EVT DataVT = N->getOperand(1).getValueType();
+
// Only consider element types that are pointer sized as smaller types can
// be easily promoted.
- EVT IndexVT = Index.getValueType();
if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
return Changed;
- // Can indices be trivially shrunk?
- EVT DataVT = N->getOperand(1).getValueType();
// Don't attempt to shrink the index for fixed vectors of 64 bit data since it
// will later be re-extended to 64 bits in legalization
if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
@@ -26202,6 +26202,17 @@ static SDValue performMaskedGatherScatterCombine(
MSC->isTruncatingStore());
}
auto *HG = cast<MaskedHistogramSDNode>(MGS);
+
+ // Histograms don't do any legalisation on the loaded data type,
+ // so if the 'add' would need to be performed on a vector of i64's, then
+ // we can't use the more optimal addressing with i32 offsets as that
+ // would return a vector of nxv4i32, which wouldn't get widened.
+ if (HG->getInc().getValueType().getScalarType() == MVT::i64 &&
+ Index.getValueType().getScalarType() == MVT::i32)
+ // FIXME: If the increment value is a constant or extended value,
+ // we can truncate the increment value.
+ return SDValue();
+
SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
Index, Scale, HG->getIntID()};
return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
diff --git a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
index 06cd65620d1c9..f8e190a29e051 100644
--- a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
@@ -496,4 +496,54 @@ define void @histogram_sext_zero_mask(ptr %base, <vscale x 4 x i32> %indices, <v
ret void
}
+; Test that we don't use nxv4i32 offsets when the increment must be performed as i64's.
+define void @histogram_dont_optimize_index(ptr %p, i64 %inc, <vscale x 4 x i1> %mask, <vscale x 4 x i8> %offsets) #0 {
+; CHECK-LABEL: histogram_dont_optimize_index:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: mov z4.d, x1
+; CHECK-NEXT: ptrue p2.d
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: histcnt z2.d, p1/z, z1.d, z1.d
+; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0, z1.d, lsl #3]
+; CHECK-NEXT: mad z2.d, p2/m, z4.d, z3.d
+; CHECK-NEXT: st1d { z2.d }, p1, [x0, z1.d, lsl #3]
+; CHECK-NEXT: histcnt z1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: mad z1.d, p2/m, z4.d, z2.d
+; CHECK-NEXT: st1d { z1.d }, p0, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+ %4 = zext <vscale x 4 x i8> %offsets to <vscale x 4 x i64>
+ %5 = getelementptr i64, ptr %p, <vscale x 4 x i64> %4
+ call void @llvm.experimental.vector.histogram.add.nxv16p0.i64(<vscale x 4 x ptr> %5, i64 %inc, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+; Test that sign-extended i32 offsets are likewise not narrowed to nxv4i32 when the increment must be performed as i64's.
+define void @histogram_sign_extend_index(ptr %p, i64 %inc, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %offsets) #0 {
+; CHECK-LABEL: histogram_sign_extend_index:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpklo z1.d, z0.s
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: mov z4.d, x1
+; CHECK-NEXT: ptrue p2.d
+; CHECK-NEXT: sunpkhi z0.d, z0.s
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: histcnt z2.d, p1/z, z1.d, z1.d
+; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0, z1.d, lsl #3]
+; CHECK-NEXT: mad z2.d, p2/m, z4.d, z3.d
+; CHECK-NEXT: st1d { z2.d }, p1, [x0, z1.d, lsl #3]
+; CHECK-NEXT: histcnt z1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: mad z1.d, p2/m, z4.d, z2.d
+; CHECK-NEXT: st1d { z1.d }, p0, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+ %5 = getelementptr i64, ptr %p, <vscale x 4 x i32> %offsets
+ call void @llvm.experimental.vector.histogram.add.nxv16p0.i64(<vscale x 4 x ptr> %5, i64 %inc, <vscale x 4 x i1> %mask)
+ ret void
+}
+
attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }
``````````
</details>
https://github.com/llvm/llvm-project/pull/181808
More information about the llvm-commits
mailing list