[llvm] [AArch64][SVE] Fix hang when in VECTOR_HISTOGRAM DAG combine (PR #152539)

Thu Aug 7 09:01:58 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-llvm-selectiondag

Author: Benjamin Maxwell (MacDue)

<details>
<summary>Changes</summary>

The histogram DAG combine went into an infinite loop of creating the same histogram node due to an incorrect use of the `refineUniformBase` and `refineIndexType` APIs.

These APIs take SDValues by reference (SDValue&) and return `true` if they were "refined" (i.e., set them to new values).

Previously, this DAG combine would create the `Ops` array (used to create the new histogram node) before calling the `refine*` APIs, which copies the SDValues into the array, meaning the updated values were not used to create the new histogram node.

Reproducer: https://godbolt.org/z/hsGWhTaqY (it will timeout)

---
Full diff: https://github.com/llvm/llvm-project/pull/152539.diff


2 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+6-7) 
- (added) llvm/test/CodeGen/AArch64/aarch64-histcnt-dag-combine-hang.ll (+70) 


``````````diff

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 734191447d67f..5f1e38a33c891 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12843,22 +12843,21 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
   SDLoc DL(HG);
 
   EVT MemVT = HG->getMemoryVT();
+  EVT DataVT = Index.getValueType();
   MachineMemOperand *MMO = HG->getMemOperand();
   ISD::MemIndexType IndexType = HG->getIndexType();
 
   if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
     return Chain;
 
-  SDValue Ops[] = {Chain,          Inc,           Mask, BasePtr, Index,
-                   HG->getScale(), HG->getIntID()};
-  if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL))
+  if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL) ||
+      refineIndexType(Index, IndexType, DataVT, DAG)) {
+    SDValue Ops[] = {Chain,          Inc,           Mask, BasePtr, Index,
+                     HG->getScale(), HG->getIntID()};
     return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
                                   MMO, IndexType);
+  }
 
-  EVT DataVT = Index.getValueType();
-  if (refineIndexType(Index, IndexType, DataVT, DAG))
-    return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
-                                  MMO, IndexType);
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-histcnt-dag-combine-hang.ll b/llvm/test/CodeGen/AArch64/aarch64-histcnt-dag-combine-hang.ll
new file mode 100644
index 0000000000000..da04c67aa6c5c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-histcnt-dag-combine-hang.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve2 -verify-machineinstrs < %s -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; This test is reduced from a real world example that would cause the DAGCombiner to hang.
+
+define void @histcnt_loop(ptr %0, i64 %1, ptr %2, i64 %3, i64 %4) {
+; CHECK-LABEL: histcnt_loop:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add x9, x0, x1
+; CHECK-NEXT:  .LBB0_1: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    lsl x10, x8, #1
+; CHECK-NEXT:    add x11, x0, x10
+; CHECK-NEXT:    add x10, x9, x10
+; CHECK-NEXT:    lsl z1.d, z1.d, #1
+; CHECK-NEXT:    ld1h { z4.d }, p0/z, [x11, #1, mul vl]
+; CHECK-NEXT:    ld1h { z5.d }, p0/z, [x10, #1, mul vl]
+; CHECK-NEXT:    histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    ld1h { z3.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT:    mad z2.d, p0/m, z0.d, z3.d
+; CHECK-NEXT:    ld1h { z3.d }, p0/z, [x9, x8, lsl #1]
+; CHECK-NEXT:    add x8, x8, x3
+; CHECK-NEXT:    cmp x4, x8
+; CHECK-NEXT:    st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT:    lsl z1.d, z4.d, #1
+; CHECK-NEXT:    histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    ld1h { z4.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT:    mad z2.d, p0/m, z0.d, z4.d
+; CHECK-NEXT:    st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT:    lsl z1.d, z3.d, #1
+; CHECK-NEXT:    histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    ld1h { z3.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT:    mad z2.d, p0/m, z0.d, z3.d
+; CHECK-NEXT:    st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT:    lsl z1.d, z5.d, #1
+; CHECK-NEXT:    histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    ld1h { z3.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT:    mad z2.d, p0/m, z0.d, z3.d
+; CHECK-NEXT:    st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:
+  %6 = phi i64 [ 0, %entry ], [ %15, %loop ]
+  %7 = getelementptr inbounds nuw i16, ptr %0, i64 %6
+  %8 = getelementptr inbounds nuw i8, ptr %7, i64 %1
+  %9 = load <vscale x 4 x i16>, ptr %7, align 2
+  %10 = load <vscale x 4 x i16>, ptr %8, align 2
+  %11 = zext <vscale x 4 x i16> %9 to <vscale x 4 x i64>
+  %12 = zext <vscale x 4 x i16> %10 to <vscale x 4 x i64>
+  %13 = getelementptr inbounds nuw [16 x i16], ptr %2, i64 0, <vscale x 4 x i64> %11
+  %14 = getelementptr inbounds nuw [16 x i16], ptr %2, i64 0, <vscale x 4 x i64> %12
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %13, i16 1, <vscale x 4 x i1> splat (i1 true))
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %14, i16 1, <vscale x 4 x i1> splat (i1 true))
+  %15 = add nuw i64 %6, %3
+  %16 = icmp eq i64 %15, %4
+  br i1 %16, label %exit, label %loop
+
+exit:
+  ret void
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/152539