[llvm] [InstCombine] Add user-count bailout to isAllocSiteRemovable (PR #190347)
JP Hafer via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 5 13:38:58 PDT 2026
https://github.com/jph-13 updated https://github.com/llvm/llvm-project/pull/190347
From 9f2d60687bddf719dff4b5d49f237c7f4e57c54f Mon Sep 17 00:00:00 2001
From: Jason Hafer <jhafer at mathworks.com>
Date: Fri, 3 Apr 2026 09:59:06 -0400
Subject: [PATCH 1/2] [InstCombine] Add user-count bailout to isAllocSiteRemovable
isAllocSiteRemovable() walks all transitive users of an alloc site,
but sites with many users are almost never removable. Profiling on
real-world codegen workloads (73,943 alloc sites) showed:
- 89 removable sites, max 1,392 users walked
- 73,854 non-removable sites, avg 31,305 users walked
- 2.31B total wasted user visits (~400s wall-clock on a 35-min build)
Skip the removability analysis when the direct user count exceeds a
configurable threshold (default 2048, tunable via the hidden cl::opt
-instcombine-max-allocsite-removable-users).
Also defer WeakTrackingVH conversion: collect into Instruction* first
and convert only when the site is actually removable.
---
.../InstCombine/InstructionCombining.cpp | 31 +++++++++++++++++--
.../Inputs/allocsite-removable-timeout-gen.py | 22 +++++++++++++
.../allocsite-removable-few-users.ll | 19 ++++++++++++
.../allocsite-removable-timeout.ll | 9 ++++++
4 files changed, 79 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
create mode 100644 llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
create mode 100644 llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 6798493de1aa3..08e7db2f9248e 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -144,6 +144,11 @@ static cl::opt<unsigned>
MaxArraySize("instcombine-maxarray-size", cl::init(1024),
cl::desc("Maximum array size considered when doing a combine"));
+static cl::opt<unsigned> MaxAllocSiteRemovableUsers(
+ "instcombine-max-allocsite-removable-users", cl::Hidden, cl::init(2048),
+ cl::desc("Maximum direct users before skipping alloc-site "
+ "removability analysis"));
+
namespace llvm {
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
} // end namespace llvm
@@ -3732,7 +3737,7 @@ static bool isRemovableWrite(CallBase &CB, Value *UsedV,
}
static std::optional<ModRefInfo>
-isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakTrackingVH> &Users,
+isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<Instruction *> &Users,
const TargetLibraryInfo &TLI, bool KnowInit) {
SmallVector<Instruction*, 4> Worklist;
const std::optional<StringRef> Family = getAllocationFamily(AI, &TLI);
@@ -3892,6 +3897,11 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
// outputs of a program (when we convert a malloc to an alloca, the fact that
// the allocation is now on the stack is potentially visible, for example),
// but we believe in a permissible manner.
+ //
+ // Collect into Instruction* first to avoid expensive WeakTrackingVH
+ // register/unregister overhead; convert to WeakTrackingVH only when the
+ // site is actually removable.
+ SmallVector<Instruction *, 64> RawUsers;
SmallVector<WeakTrackingVH, 64> Users;
// If we are removing an alloca with a dbg.declare, insert dbg.value calls
@@ -3922,8 +3932,25 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
F.hasFnAttribute(Attribute::SanitizeAddress))
KnowInitUndef = false;
+ // Skip alloc sites with many direct users -- they are almost never removable
+ // and the transitive user walk in isAllocSiteRemovable is expensive.
+ {
+ unsigned DirectUserCount = 0;
+ for (auto UI = MI.user_begin(), UE = MI.user_end(); UI != UE; ++UI) {
+ if (++DirectUserCount > MaxAllocSiteRemovableUsers)
+ break;
+ }
+ if (DirectUserCount > MaxAllocSiteRemovableUsers)
+ return nullptr;
+ }
+
auto Removable =
- isAllocSiteRemovable(&MI, Users, TLI, KnowInitZero | KnowInitUndef);
+ isAllocSiteRemovable(&MI, RawUsers, TLI, KnowInitZero | KnowInitUndef);
+ if (Removable) {
+ Users.reserve(RawUsers.size());
+ for (Instruction *I : RawUsers)
+ Users.emplace_back(I);
+ }
if (Removable) {
for (WeakTrackingVH &User : Users) {
// Lowering all @llvm.objectsize and MTI calls first because they may use
diff --git a/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py b/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
new file mode 100644
index 0000000000000..86a7b0f7c8fff
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+"""Generate IR with a large non-removable alloca to stress-test
+isAllocSiteRemovable compile time."""
+
+import sys
+
+N = int(sys.argv[1]) if len(sys.argv) > 1 else 12000
+
+print("declare void @escape(ptr)")
+print()
+print("define i32 @stress() {")
+print("entry:")
+print(f" %a = alloca [{N} x i8], align 1")
+print(" %sum0 = add i32 0, 0")
+for i in range(N):
+ print(f" %p{i} = getelementptr inbounds [{N} x i8], ptr %a, i64 0, i64 {i}")
+ print(f" %v{i} = load i8, ptr %p{i}, align 1")
+ print(f" %z{i} = zext i8 %v{i} to i32")
+ print(f" %sum{i+1} = add i32 %sum{i}, %z{i}")
+print(" call void @escape(ptr %a)")
+print(f" ret i32 %sum{N}")
+print("}")
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
new file mode 100644
index 0000000000000..5ccd691790ea1
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+; Test that alloca with few users is still removed by
+; isAllocSiteRemovable (not blocked by user-count bailout).
+define i32 @test_few_users_still_removed() {
+; CHECK-LABEL: @test_few_users_still_removed(
+; CHECK-NEXT: ret i32 0
+;
+ %a = alloca [4 x i8], align 1
+ %p0 = getelementptr inbounds [4 x i8], ptr %a, i64 0, i64 0
+ %p1 = getelementptr inbounds [4 x i8], ptr %a, i64 0, i64 1
+ %v0 = load i8, ptr %p0, align 1
+ %v1 = load i8, ptr %p1, align 1
+ %z0 = zext i8 %v0 to i32
+ %z1 = zext i8 %v1 to i32
+ %sum = add i32 %z0, %z1
+ ret i32 %sum
+}
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
new file mode 100644
index 0000000000000..8e2332afda83f
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
@@ -0,0 +1,9 @@
+; RUN: %python %S/Inputs/allocsite-removable-timeout-gen.py 12000 | \
+; RUN: opt -passes=instcombine \
+; RUN: -instcombine-max-allocsite-removable-users=128 \
+; RUN: -disable-output
+
+; Compile-time regression test for isAllocSiteRemovable().
+; The generated function contains an alloca with many direct users plus one
+; escaping use, so the alloca is not removable. Without the bailout, this
-; takes O(N^2) time in the user walk. Success = InstCombine finishes quickly.
From 5b371c9f6dfc4755b1bded2920a042829fcbd971 Mon Sep 17 00:00:00 2001
From: Jason Hafer <jhafer at mathworks.com>
Date: Sun, 5 Apr 2026 16:38:33 -0400
Subject: [PATCH 2/2] fixup! [InstCombine] Add user-count bailout to isAllocSiteRemovable
---
.../InstCombine/InstructionCombining.cpp | 23 ++++---------------
.../Inputs/allocsite-removable-timeout-gen.py | 22 ------------------
.../allocsite-removable-few-users.ll | 18 ++++++++++++---
.../allocsite-removable-timeout.ll | 9 --------
4 files changed, 19 insertions(+), 53 deletions(-)
delete mode 100644 llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
delete mode 100644 llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 08e7db2f9248e..7b0bfab7edda5 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -146,7 +146,7 @@ MaxArraySize("instcombine-maxarray-size", cl::init(1024),
static cl::opt<unsigned> MaxAllocSiteRemovableUsers(
"instcombine-max-allocsite-removable-users", cl::Hidden, cl::init(2048),
- cl::desc("Maximum direct users before skipping alloc-site "
+ cl::desc("Maximum number of users to visit in alloc-site "
"removability analysis"));
namespace llvm {
@@ -3748,6 +3748,8 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<Instruction *> &Users,
Instruction *PI = Worklist.pop_back_val();
for (User *U : PI->users()) {
Instruction *I = cast<Instruction>(U);
+ if (Users.size() >= MaxAllocSiteRemovableUsers)
+ return std::nullopt;
switch (I->getOpcode()) {
default:
// Give up the moment we see something we can't handle.
@@ -3902,7 +3904,6 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
// register/unregister overhead; convert to WeakTrackingVH only when the
// site is actually removable.
SmallVector<Instruction *, 64> RawUsers;
- SmallVector<WeakTrackingVH, 64> Users;
// If we are removing an alloca with a dbg.declare, insert dbg.value calls
// before each store.
@@ -3932,26 +3933,10 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
F.hasFnAttribute(Attribute::SanitizeAddress))
KnowInitUndef = false;
- // Skip alloc sites with many direct users -- they are almost never removable
- // and the transitive user walk in isAllocSiteRemovable is expensive.
- {
- unsigned DirectUserCount = 0;
- for (auto UI = MI.user_begin(), UE = MI.user_end(); UI != UE; ++UI) {
- if (++DirectUserCount > MaxAllocSiteRemovableUsers)
- break;
- }
- if (DirectUserCount > MaxAllocSiteRemovableUsers)
- return nullptr;
- }
-
auto Removable =
isAllocSiteRemovable(&MI, RawUsers, TLI, KnowInitZero | KnowInitUndef);
if (Removable) {
- Users.reserve(RawUsers.size());
- for (Instruction *I : RawUsers)
- Users.emplace_back(I);
- }
- if (Removable) {
+ SmallVector<WeakTrackingVH, 64> Users(RawUsers.begin(), RawUsers.end());
for (WeakTrackingVH &User : Users) {
// Lowering all @llvm.objectsize and MTI calls first because they may use
// a bitcast/GEP of the alloca we are removing.
diff --git a/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py b/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
deleted file mode 100644
index 86a7b0f7c8fff..0000000000000
--- a/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env python3
-"""Generate IR with a large non-removable alloca to stress-test
-isAllocSiteRemovable compile time."""
-
-import sys
-
-N = int(sys.argv[1]) if len(sys.argv) > 1 else 12000
-
-print("declare void @escape(ptr)")
-print()
-print("define i32 @stress() {")
-print("entry:")
-print(f" %a = alloca [{N} x i8], align 1")
-print(" %sum0 = add i32 0, 0")
-for i in range(N):
- print(f" %p{i} = getelementptr inbounds [{N} x i8], ptr %a, i64 0, i64 {i}")
- print(f" %v{i} = load i8, ptr %p{i}, align 1")
- print(f" %z{i} = zext i8 %v{i} to i32")
- print(f" %sum{i+1} = add i32 %sum{i}, %z{i}")
-print(" call void @escape(ptr %a)")
-print(f" ret i32 %sum{N}")
-print("}")
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
index 5ccd691790ea1..e05ff2b23559f 100644
--- a/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
+++ b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
@@ -1,11 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s --check-prefix=OPT
+; RUN: opt -passes=instcombine -instcombine-max-allocsite-removable-users=1 -S < %s | FileCheck %s --check-prefix=NOOPT
; Test that alloca with few users is still removed by
; isAllocSiteRemovable (not blocked by user-count bailout).
+; With a threshold of 1, the alloca is not removed.
define i32 @test_few_users_still_removed() {
-; CHECK-LABEL: @test_few_users_still_removed(
-; CHECK-NEXT: ret i32 0
+; OPT-LABEL: @test_few_users_still_removed(
+; OPT-NEXT: ret i32 0
+;
+; NOOPT-LABEL: @test_few_users_still_removed(
+; NOOPT-NEXT: [[A:%.*]] = alloca [4 x i8], align 1
+; NOOPT-NEXT: [[P1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 1
+; NOOPT-NEXT: [[V0:%.*]] = load i8, ptr [[A]], align 1
+; NOOPT-NEXT: [[V1:%.*]] = load i8, ptr [[P1]], align 1
+; NOOPT-NEXT: [[Z0:%.*]] = zext i8 [[V0]] to i32
+; NOOPT-NEXT: [[Z1:%.*]] = zext i8 [[V1]] to i32
+; NOOPT-NEXT: [[SUM:%.*]] = add nuw nsw i32 [[Z0]], [[Z1]]
+; NOOPT-NEXT: ret i32 [[SUM]]
;
%a = alloca [4 x i8], align 1
%p0 = getelementptr inbounds [4 x i8], ptr %a, i64 0, i64 0
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
deleted file mode 100644
index 8e2332afda83f..0000000000000
--- a/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: %python %S/Inputs/allocsite-removable-timeout-gen.py 12000 | \
-; RUN: opt -passes=instcombine \
-; RUN: -instcombine-max-allocsite-removable-users=128 \
-; RUN: -disable-output
-
-; Compile-time regression test for isAllocSiteRemovable().
-; The generated function contains an alloca with many direct users plus one
-; escaping use, so the alloca is not removable. Without the bailout, this
-; takes O(N^2) time in the user walk. Success = InstCombine finishes quickly.
More information about the llvm-commits mailing list