[llvm-branch-commits] [llvm] release/22.x: [SLP]Do not throttle nodes with split parents, if any of scalars is used in more than one split nodes (PR #176326)
Cullen Rhodes via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jan 19 01:47:19 PST 2026
https://github.com/c-rhodes updated https://github.com/llvm/llvm-project/pull/176326
>From 4dd23aa76f58ae50e61570d199718b2bb967127f Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 14 Jan 2026 11:56:17 -0800
Subject: [PATCH] [SLP]Do not throttle nodes with split parents, if any of
scalars is used in more than one split nodes
If the node to throttle is a vector node, which is used in a split
node, and at least one scalar of such a node is used in many split
nodes, such a vector node should not be throttled. Otherwise there
might be a wrong def-use chain, which crashes the compiler.
Fixes #175967
(cherry picked from commit c322a0c462b1b277e5862aeae2e95a40f7b130d4)
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 10 +-
.../SLPVectorizer/X86/split-node-throttled.ll | 147 ++++++++++++++++++
2 files changed, 156 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/split-node-throttled.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e075268a0a8e9..7808e922dd90a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -16575,7 +16575,15 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
bool Changed = false;
while (!Worklist.empty() && Worklist.top().second.first > 0) {
TreeEntry *TE = Worklist.top().first;
- if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE)) {
+ if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) ||
+ // Exit early if the parent node is split node and any of scalars is
+ // used in other split nodes.
+ (TE->UserTreeIndex &&
+ TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
+ any_of(TE->Scalars, [&](Value *V) {
+ ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
+ return Entries.size() > 1;
+ }))) {
Worklist.pop();
continue;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-throttled.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-throttled.ll
new file mode 100644
index 0000000000000..4dd905ac3e994
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-throttled.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-grtev4-linux-gnu -mcpu=haswell < %s | FileCheck %s
+
+define fastcc void @test(i32 %arg) {
+; CHECK-LABEL: define fastcc void @test(
+; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 false, i32 0, i32 0
+; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i8, ptr poison, i64 16
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[ARG]], i32 1
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[SELECT]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[TMP0]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP25:%.*]] = zext <2 x i32> [[TMP24]] to <2 x i64>
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i64> [[TMP25]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP26]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP10:%.*]] = mul <8 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = trunc <8 x i64> [[TMP10]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> zeroinitializer, [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> poison, <8 x i32> <i32 0, i32 4, i32 2, i32 6, i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT: [[TMP15:%.*]] = mul <8 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT: [[TMP16:%.*]] = trunc <8 x i64> [[TMP15]] to <8 x i1>
+; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP14]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = lshr <8 x i1> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = zext <8 x i1> [[TMP18]] to <8 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP19]], <8 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP21:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP20]], <8 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP22:%.*]] = trunc <8 x i32> [[TMP21]] to <8 x i16>
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x i16> [[TMP22]], <8 x i16> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i16> [[TMP23]], ptr [[GETELEMENTPTR]], align 2
+; CHECK-NEXT: ret void
+;
+bb:
+ %zext = zext i32 %arg to i64
+ %sext = sext i32 0 to i64
+ %select = select i1 false, i32 0, i32 0
+ %zext1 = zext i32 %select to i64
+ %mul = mul i64 0, %zext1
+ %or = or i64 0, %mul
+ %or2 = or i64 %or, 0
+ %zext3 = zext i32 0 to i64
+ %mul4 = mul i64 0, %zext3
+ %or5 = or i64 %or2, %mul4
+ %lshr = lshr i64 %or5, 0
+ %trunc = trunc i64 %lshr to i32
+ %call = tail call i32 @llvm.smax.i32(i32 %trunc, i32 0)
+ %call6 = tail call i32 @llvm.smin.i32(i32 %call, i32 0)
+ %trunc7 = trunc i32 %call6 to i16
+ %getelementptr = getelementptr i8, ptr poison, i64 16
+ store i16 %trunc7, ptr %getelementptr, align 2
+ %mul8 = mul i64 0, %zext
+ %or9 = or i64 0, %mul8
+ %or10 = or i64 %or9, 0
+ %sext11 = sext i32 0 to i64
+ %mul12 = mul i64 0, %sext11
+ %or13 = or i64 %or10, %mul12
+ %lshr14 = lshr i64 %or13, 0
+ %trunc15 = trunc i64 %lshr14 to i32
+ %call16 = tail call i32 @llvm.smax.i32(i32 %trunc15, i32 0)
+ %call17 = tail call i32 @llvm.smin.i32(i32 %call16, i32 0)
+ %trunc18 = trunc i32 %call17 to i16
+ %getelementptr19 = getelementptr i8, ptr poison, i64 18
+ store i16 %trunc18, ptr %getelementptr19, align 2
+ %sext20 = sext i32 0 to i64
+ %mul21 = mul i64 0, %sext20
+ %or22 = or i64 %zext3, %mul21
+ %or23 = or i64 %or22, 0
+ %mul24 = mul i64 0, %zext1
+ %or25 = or i64 %or23, %mul24
+ %lshr26 = lshr i64 %or25, 0
+ %trunc27 = trunc i64 %lshr26 to i32
+ %call28 = tail call i32 @llvm.smax.i32(i32 %trunc27, i32 0)
+ %call29 = tail call i32 @llvm.smin.i32(i32 %call28, i32 0)
+ %trunc30 = trunc i32 %call29 to i16
+ %getelementptr31 = getelementptr i8, ptr poison, i64 20
+ store i16 %trunc30, ptr %getelementptr31, align 2
+ %sext32 = sext i32 0 to i64
+ %mul33 = mul i64 0, %sext32
+ %or34 = or i64 0, %mul33
+ %or35 = or i64 %or34, 0
+ %mul36 = mul i64 0, %sext
+ %or37 = or i64 %or35, %mul36
+ %lshr38 = lshr i64 %or37, 0
+ %trunc39 = trunc i64 %lshr38 to i32
+ %call40 = tail call i32 @llvm.smax.i32(i32 %trunc39, i32 0)
+ %call41 = tail call i32 @llvm.smin.i32(i32 %call40, i32 0)
+ %trunc42 = trunc i32 %call41 to i16
+ %getelementptr43 = getelementptr i8, ptr poison, i64 22
+ store i16 %trunc42, ptr %getelementptr43, align 2
+ %mul44 = mul i64 0, %zext1
+ %or45 = or i64 0, %mul44
+ %or46 = or i64 %or45, 0
+ %mul47 = mul i64 0, %zext3
+ %or48 = or i64 %or46, %mul47
+ %lshr49 = lshr i64 %or48, 0
+ %trunc50 = trunc i64 %lshr49 to i32
+ %call51 = tail call i32 @llvm.smax.i32(i32 %trunc50, i32 0)
+ %call52 = tail call i32 @llvm.smin.i32(i32 %call51, i32 0)
+ %trunc53 = trunc i32 %call52 to i16
+ %getelementptr54 = getelementptr i8, ptr poison, i64 24
+ store i16 %trunc53, ptr %getelementptr54, align 2
+ %mul55 = mul i64 0, %zext
+ %or56 = or i64 0, %mul55
+ %or57 = or i64 %or56, 0
+ %mul58 = mul i64 0, %sext11
+ %or59 = or i64 %or57, %mul58
+ %lshr60 = lshr i64 %or59, 0
+ %trunc61 = trunc i64 %lshr60 to i32
+ %call62 = tail call i32 @llvm.smax.i32(i32 %trunc61, i32 0)
+ %call63 = tail call i32 @llvm.smin.i32(i32 %call62, i32 0)
+ %trunc64 = trunc i32 %call63 to i16
+ %getelementptr65 = getelementptr i8, ptr poison, i64 26
+ store i16 %trunc64, ptr %getelementptr65, align 2
+ %mul66 = mul i64 0, %sext20
+ %or67 = or i64 0, %mul66
+ %or68 = or i64 %or67, 0
+ %mul69 = mul i64 0, %zext1
+ %or70 = or i64 %or68, %mul69
+ %lshr71 = lshr i64 %or70, 0
+ %trunc72 = trunc i64 %lshr71 to i32
+ %call73 = tail call i32 @llvm.smax.i32(i32 %trunc72, i32 0)
+ %call74 = tail call i32 @llvm.smin.i32(i32 %call73, i32 0)
+ %trunc75 = trunc i32 %call74 to i16
+ %getelementptr76 = getelementptr i8, ptr poison, i64 28
+ store i16 %trunc75, ptr %getelementptr76, align 2
+ %mul77 = mul i64 0, %sext32
+ %or78 = or i64 0, %mul77
+ %or79 = or i64 %or78, 0
+ %mul80 = mul i64 0, %sext
+ %or81 = or i64 %or79, %mul80
+ %lshr82 = lshr i64 %or81, 0
+ %trunc83 = trunc i64 %lshr82 to i32
+ %call84 = tail call i32 @llvm.smax.i32(i32 %trunc83, i32 0)
+ %call85 = tail call i32 @llvm.smin.i32(i32 %call84, i32 0)
+ %trunc86 = trunc i32 %call85 to i16
+ %getelementptr87 = getelementptr i8, ptr poison, i64 30
+ store i16 %trunc86, ptr %getelementptr87, align 2
+ ret void
+}
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
More information about the llvm-branch-commits
mailing list