[llvm] [InstCombine] Add user-count bailout to isAllocSiteRemovable (PR #190347)
JP Hafer via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 3 07:07:48 PDT 2026
https://github.com/jph-13 created https://github.com/llvm/llvm-project/pull/190347
isAllocSiteRemovable() walks all transitive users of an alloc site, but sites with many users are almost never removable. Profiling on real-world codegen workloads (73,943 alloc sites) showed:
- 89 removable sites, max 1,392 users walked
- 73,854 non-removable sites, avg 31,305 users walked
- 2.31B total wasted user visits (~400s wall-clock on a 35-min build)
Skip the removability analysis when the number of direct users exceeds a configurable threshold (default 2048, tunable via the hidden cl::opt -instcombine-max-allocsite-removable-users).
Also defer the WeakTrackingVH conversion: collect the users as plain Instruction* first and convert them to WeakTrackingVH only when the site is actually removable.
From 9f2d60687bddf719dff4b5d49f237c7f4e57c54f Mon Sep 17 00:00:00 2001
From: Jason Hafer <jhafer at mathworks.com>
Date: Fri, 3 Apr 2026 09:59:06 -0400
Subject: [PATCH] [InstCombine] Add user-count bailout to isAllocSiteRemovable
isAllocSiteRemovable() walks all transitive users of an alloc site,
but sites with many users are almost never removable. Profiling on
real-world codegen workloads (73,943 alloc sites) showed:
- 89 removable sites, max 1,392 users walked
- 73,854 non-removable sites, avg 31,305 users walked
- 2.31B total wasted user visits (~400s wall-clock on a 35-min build)
Skip the removability analysis when the number of direct users exceeds
a configurable threshold (default 2048, tunable via the hidden cl::opt
-instcombine-max-allocsite-removable-users).
Also defer the WeakTrackingVH conversion: collect the users as plain
Instruction* first and convert them to WeakTrackingVH only when the
site is actually removable.
---
.../InstCombine/InstructionCombining.cpp | 31 +++++++++++++++++--
.../Inputs/allocsite-removable-timeout-gen.py | 22 +++++++++++++
.../allocsite-removable-few-users.ll | 19 ++++++++++++
.../allocsite-removable-timeout.ll | 9 ++++++
4 files changed, 79 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
create mode 100644 llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
create mode 100644 llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 6798493de1aa3..08e7db2f9248e 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -144,6 +144,11 @@ static cl::opt<unsigned>
MaxArraySize("instcombine-maxarray-size", cl::init(1024),
cl::desc("Maximum array size considered when doing a combine"));
+static cl::opt<unsigned> MaxAllocSiteRemovableUsers(
+ "instcombine-max-allocsite-removable-users", cl::Hidden, cl::init(2048),
+ cl::desc("Maximum direct users before skipping alloc-site "
+ "removability analysis"));
+
namespace llvm {
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
} // end namespace llvm
@@ -3732,7 +3737,7 @@ static bool isRemovableWrite(CallBase &CB, Value *UsedV,
}
static std::optional<ModRefInfo>
-isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakTrackingVH> &Users,
+isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<Instruction *> &Users,
const TargetLibraryInfo &TLI, bool KnowInit) {
SmallVector<Instruction*, 4> Worklist;
const std::optional<StringRef> Family = getAllocationFamily(AI, &TLI);
@@ -3892,6 +3897,11 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
// outputs of a program (when we convert a malloc to an alloca, the fact that
// the allocation is now on the stack is potentially visible, for example),
// but we believe in a permissible manner.
+ //
+ // Collect into Instruction* first to avoid expensive WeakTrackingVH
+ // register/unregister overhead; convert to WeakTrackingVH only when the
+ // site is actually removable.
+ SmallVector<Instruction *, 64> RawUsers;
SmallVector<WeakTrackingVH, 64> Users;
// If we are removing an alloca with a dbg.declare, insert dbg.value calls
@@ -3922,8 +3932,25 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
F.hasFnAttribute(Attribute::SanitizeAddress))
KnowInitUndef = false;
+ // Skip alloc sites with many direct users -- they are almost never removable
+ // and the transitive user walk in isAllocSiteRemovable is expensive.
+ {
+ unsigned DirectUserCount = 0;
+ for (auto UI = MI.user_begin(), UE = MI.user_end(); UI != UE; ++UI) {
+ if (++DirectUserCount > MaxAllocSiteRemovableUsers)
+ break;
+ }
+ if (DirectUserCount > MaxAllocSiteRemovableUsers)
+ return nullptr;
+ }
+
auto Removable =
- isAllocSiteRemovable(&MI, Users, TLI, KnowInitZero | KnowInitUndef);
+ isAllocSiteRemovable(&MI, RawUsers, TLI, KnowInitZero | KnowInitUndef);
+ if (Removable) {
+ Users.reserve(RawUsers.size());
+ for (Instruction *I : RawUsers)
+ Users.emplace_back(I);
+ }
if (Removable) {
for (WeakTrackingVH &User : Users) {
// Lowering all @llvm.objectsize and MTI calls first because they may use
diff --git a/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py b/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
new file mode 100644
index 0000000000000..86a7b0f7c8fff
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+"""Generate IR with a large non-removable alloca to stress-test
+isAllocSiteRemovable compile time."""
+
+import sys
+
+N = int(sys.argv[1]) if len(sys.argv) > 1 else 12000
+
+print("declare void @escape(ptr)")
+print()
+print("define i32 @stress() {")
+print("entry:")
+print(f" %a = alloca [{N} x i8], align 1")
+print(" %sum0 = add i32 0, 0")
+for i in range(N):
+ print(f" %p{i} = getelementptr inbounds [{N} x i8], ptr %a, i64 0, i64 {i}")
+ print(f" %v{i} = load i8, ptr %p{i}, align 1")
+ print(f" %z{i} = zext i8 %v{i} to i32")
+ print(f" %sum{i+1} = add i32 %sum{i}, %z{i}")
+print(" call void @escape(ptr %a)")
+print(f" ret i32 %sum{N}")
+print("}")
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
new file mode 100644
index 0000000000000..5ccd691790ea1
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+; Test that alloca with few users is still removed by
+; isAllocSiteRemovable (not blocked by user-count bailout).
+define i32 @test_few_users_still_removed() {
+; CHECK-LABEL: @test_few_users_still_removed(
+; CHECK-NEXT: ret i32 0
+;
+ %a = alloca [4 x i8], align 1
+ %p0 = getelementptr inbounds [4 x i8], ptr %a, i64 0, i64 0
+ %p1 = getelementptr inbounds [4 x i8], ptr %a, i64 0, i64 1
+ %v0 = load i8, ptr %p0, align 1
+ %v1 = load i8, ptr %p1, align 1
+ %z0 = zext i8 %v0 to i32
+ %z1 = zext i8 %v1 to i32
+ %sum = add i32 %z0, %z1
+ ret i32 %sum
+}
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
new file mode 100644
index 0000000000000..8e2332afda83f
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
@@ -0,0 +1,9 @@
+; RUN: %python %S/Inputs/allocsite-removable-timeout-gen.py 12000 | \
+; RUN: opt -passes=instcombine \
+; RUN: -instcombine-max-allocsite-removable-users=128 \
+; RUN: -disable-output
+
+; Compile-time regression test for isAllocSiteRemovable().
+; The generated function contains an alloca with many direct users plus one
+; escaping use, so the alloca is not removable. Without the bailout, this
+; takes O(N^2) time in the user walk. Success = InstCombine finishes quickly.
More information about the llvm-commits mailing list