[llvm] [InstCombine] Add user-count bailout to isAllocSiteRemovable (PR #190347)

JP Hafer via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 3 07:07:48 PDT 2026


https://github.com/jph-13 created https://github.com/llvm/llvm-project/pull/190347

isAllocSiteRemovable() walks all transitive users of an alloc site, but sites with many users are almost never removable. Profiling on real-world codegen workloads (73,943 alloc sites) showed:

- 89 removable sites, max 1,392 users walked
- 73,854 non-removable sites, avg 31,305 users walked
- 2.31B total wasted user visits (~400s wall-clock on a 35-min build)

Skip the removability analysis when the direct user count exceeds a configurable threshold (default 2048, tunable via the hidden cl::opt -instcombine-max-allocsite-removable-users).

Also defer WeakTrackingVH conversion: collect into Instruction* first and convert only when the site is actually removable.

From 9f2d60687bddf719dff4b5d49f237c7f4e57c54f Mon Sep 17 00:00:00 2001
From: Jason Hafer <jhafer at mathworks.com>
Date: Fri, 3 Apr 2026 09:59:06 -0400
Subject: [PATCH] [InstCombine] Add user-count bailout to isAllocSiteRemovable

isAllocSiteRemovable() walks all transitive users of an alloc site,
but sites with many users are almost never removable. Profiling on
real-world codegen workloads (73,943 alloc sites) showed:

- 89 removable sites, max 1,392 users walked
- 73,854 non-removable sites, avg 31,305 users walked
- 2.31B total wasted user visits (~400s wall-clock on a 35-min build)

Skip the removability analysis when the direct user count exceeds a
configurable threshold (default 2048, tunable via the hidden cl::opt
-instcombine-max-allocsite-removable-users).

Also defer WeakTrackingVH conversion: collect into Instruction* first
and convert only when the site is actually removable.
---
 .../InstCombine/InstructionCombining.cpp      | 31 +++++++++++++++++--
 .../Inputs/allocsite-removable-timeout-gen.py | 22 +++++++++++++
 .../allocsite-removable-few-users.ll          | 19 ++++++++++++
 .../allocsite-removable-timeout.ll            |  9 ++++++
 4 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
 create mode 100644 llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
 create mode 100644 llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 6798493de1aa3..08e7db2f9248e 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -144,6 +144,11 @@ static cl::opt<unsigned>
 MaxArraySize("instcombine-maxarray-size", cl::init(1024),
              cl::desc("Maximum array size considered when doing a combine"));
 
+static cl::opt<unsigned> MaxAllocSiteRemovableUsers(
+    "instcombine-max-allocsite-removable-users", cl::Hidden, cl::init(2048),
+    cl::desc("Maximum direct users before skipping alloc-site "
+             "removability analysis"));
+
 namespace llvm {
 extern cl::opt<bool> ProfcheckDisableMetadataFixes;
 } // end namespace llvm
@@ -3732,7 +3737,7 @@ static bool isRemovableWrite(CallBase &CB, Value *UsedV,
 }
 
 static std::optional<ModRefInfo>
-isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakTrackingVH> &Users,
+isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<Instruction *> &Users,
                      const TargetLibraryInfo &TLI, bool KnowInit) {
   SmallVector<Instruction*, 4> Worklist;
   const std::optional<StringRef> Family = getAllocationFamily(AI, &TLI);
@@ -3892,6 +3897,11 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
   // outputs of a program (when we convert a malloc to an alloca, the fact that
   // the allocation is now on the stack is potentially visible, for example),
   // but we believe in a permissible manner.
+  //
+  // Collect into Instruction* first to avoid expensive WeakTrackingVH
+  // register/unregister overhead; convert to WeakTrackingVH only when the
+  // site is actually removable.
+  SmallVector<Instruction *, 64> RawUsers;
   SmallVector<WeakTrackingVH, 64> Users;
 
   // If we are removing an alloca with a dbg.declare, insert dbg.value calls
@@ -3922,8 +3932,25 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
       F.hasFnAttribute(Attribute::SanitizeAddress))
     KnowInitUndef = false;
 
+  // Skip alloc sites with many direct users -- they are almost never removable
+  // and the transitive user walk in isAllocSiteRemovable is expensive.
+  {
+    unsigned DirectUserCount = 0;
+    for (auto UI = MI.user_begin(), UE = MI.user_end(); UI != UE; ++UI) {
+      if (++DirectUserCount > MaxAllocSiteRemovableUsers)
+        break;
+    }
+    if (DirectUserCount > MaxAllocSiteRemovableUsers)
+      return nullptr;
+  }
+
   auto Removable =
-      isAllocSiteRemovable(&MI, Users, TLI, KnowInitZero | KnowInitUndef);
+      isAllocSiteRemovable(&MI, RawUsers, TLI, KnowInitZero | KnowInitUndef);
+  if (Removable) {
+    Users.reserve(RawUsers.size());
+    for (Instruction *I : RawUsers)
+      Users.emplace_back(I);
+  }
   if (Removable) {
     for (WeakTrackingVH &User : Users) {
       // Lowering all @llvm.objectsize and MTI calls first because they may use
diff --git a/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py b/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
new file mode 100644
index 0000000000000..86a7b0f7c8fff
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+"""Generate IR with a large non-removable alloca to stress-test
+isAllocSiteRemovable compile time."""
+
+import sys
+
+N = int(sys.argv[1]) if len(sys.argv) > 1 else 12000
+
+print("declare void @escape(ptr)")
+print()
+print("define i32 @stress() {")
+print("entry:")
+print(f"  %a = alloca [{N} x i8], align 1")
+print("  %sum0 = add i32 0, 0")
+for i in range(N):
+    print(f"  %p{i} = getelementptr inbounds [{N} x i8], ptr %a, i64 0, i64 {i}")
+    print(f"  %v{i} = load i8, ptr %p{i}, align 1")
+    print(f"  %z{i} = zext i8 %v{i} to i32")
+    print(f"  %sum{i+1} = add i32 %sum{i}, %z{i}")
+print("  call void @escape(ptr %a)")
+print(f"  ret i32 %sum{N}")
+print("}")
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
new file mode 100644
index 0000000000000..5ccd691790ea1
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+; Test that an alloca with few users is still removed, i.e. that the
+; user-count bailout does not skip the isAllocSiteRemovable() analysis.
+define i32 @test_few_users_still_removed() {
+; CHECK-LABEL: @test_few_users_still_removed(
+; CHECK-NEXT:    ret i32 0
+;
+  %a = alloca [4 x i8], align 1
+  %p0 = getelementptr inbounds [4 x i8], ptr %a, i64 0, i64 0
+  %p1 = getelementptr inbounds [4 x i8], ptr %a, i64 0, i64 1
+  %v0 = load i8, ptr %p0, align 1
+  %v1 = load i8, ptr %p1, align 1
+  %z0 = zext i8 %v0 to i32
+  %z1 = zext i8 %v1 to i32
+  %sum = add i32 %z0, %z1
+  ret i32 %sum
+}
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
new file mode 100644
index 0000000000000..8e2332afda83f
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
@@ -0,0 +1,9 @@
+; RUN: %python %S/Inputs/allocsite-removable-timeout-gen.py 12000 | \
+; RUN:   opt -passes=instcombine \
+; RUN:       -instcombine-max-allocsite-removable-users=128 \
+; RUN:       -disable-output
+
+; Compile-time regression test for isAllocSiteRemovable().
+; The generated function contains an alloca with many direct users plus one
+; escaping use, so the alloca is not removable. Without the bailout, this
+; takes O(N^2) time in the user walk. Success = InstCombine finishes quickly.



More information about the llvm-commits mailing list