[llvm] [InstCombine] Add user-count bailout to isAllocSiteRemovable (PR #190347)

JP Hafer via llvm-commits llvm-commits at lists.llvm.org
Sun Apr 5 13:38:58 PDT 2026


https://github.com/jph-13 updated https://github.com/llvm/llvm-project/pull/190347

>From 9f2d60687bddf719dff4b5d49f237c7f4e57c54f Mon Sep 17 00:00:00 2001
From: Jason Hafer <jhafer at mathworks.com>
Date: Fri, 3 Apr 2026 09:59:06 -0400
Subject: [PATCH 1/2] [InstCombine] Add user-count bailout to
 isAllocSiteRemovable

isAllocSiteRemovable() walks all transitive users of an alloc site,
but sites with many users are almost never removable. Profiling on
real-world codegen workloads (73,943 alloc sites) showed:

- 89 removable sites, max 1,392 users walked
- 73,854 non-removable sites, avg 31,305 users walked
- 2.31B total wasted user visits (~400s wall-clock on a 35-min build)

Skip the removability analysis when the direct user count exceeds a
configurable threshold (default 2048, tunable via the hidden cl::opt
-instcombine-max-allocsite-removable-users).

Also defer WeakTrackingVH conversion: collect users as raw Instruction*
first and convert to WeakTrackingVH only when the site is actually removable.
---
 .../InstCombine/InstructionCombining.cpp      | 31 +++++++++++++++++--
 .../Inputs/allocsite-removable-timeout-gen.py | 22 +++++++++++++
 .../allocsite-removable-few-users.ll          | 19 ++++++++++++
 .../allocsite-removable-timeout.ll            |  9 ++++++
 4 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
 create mode 100644 llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
 create mode 100644 llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 6798493de1aa3..08e7db2f9248e 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -144,6 +144,11 @@ static cl::opt<unsigned>
 MaxArraySize("instcombine-maxarray-size", cl::init(1024),
              cl::desc("Maximum array size considered when doing a combine"));
 
+static cl::opt<unsigned> MaxAllocSiteRemovableUsers(
+    "instcombine-max-allocsite-removable-users", cl::Hidden, cl::init(2048),
+    cl::desc("Maximum direct users before skipping alloc-site "
+             "removability analysis"));
+
 namespace llvm {
 extern cl::opt<bool> ProfcheckDisableMetadataFixes;
 } // end namespace llvm
@@ -3732,7 +3737,7 @@ static bool isRemovableWrite(CallBase &CB, Value *UsedV,
 }
 
 static std::optional<ModRefInfo>
-isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakTrackingVH> &Users,
+isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<Instruction *> &Users,
                      const TargetLibraryInfo &TLI, bool KnowInit) {
   SmallVector<Instruction*, 4> Worklist;
   const std::optional<StringRef> Family = getAllocationFamily(AI, &TLI);
@@ -3892,6 +3897,11 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
   // outputs of a program (when we convert a malloc to an alloca, the fact that
   // the allocation is now on the stack is potentially visible, for example),
   // but we believe in a permissible manner.
+  //
+  // Collect into Instruction* first to avoid expensive WeakTrackingVH
+  // register/unregister overhead; convert to WeakTrackingVH only when the
+  // site is actually removable.
+  SmallVector<Instruction *, 64> RawUsers;
   SmallVector<WeakTrackingVH, 64> Users;
 
   // If we are removing an alloca with a dbg.declare, insert dbg.value calls
@@ -3922,8 +3932,25 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
       F.hasFnAttribute(Attribute::SanitizeAddress))
     KnowInitUndef = false;
 
+  // Skip alloc sites with many direct users -- they are almost never removable
+  // and the transitive user walk in isAllocSiteRemovable is expensive.
+  {
+    unsigned DirectUserCount = 0;
+    for (auto UI = MI.user_begin(), UE = MI.user_end(); UI != UE; ++UI) {
+      if (++DirectUserCount > MaxAllocSiteRemovableUsers)
+        break;
+    }
+    if (DirectUserCount > MaxAllocSiteRemovableUsers)
+      return nullptr;
+  }
+
   auto Removable =
-      isAllocSiteRemovable(&MI, Users, TLI, KnowInitZero | KnowInitUndef);
+      isAllocSiteRemovable(&MI, RawUsers, TLI, KnowInitZero | KnowInitUndef);
+  if (Removable) {
+    Users.reserve(RawUsers.size());
+    for (Instruction *I : RawUsers)
+      Users.emplace_back(I);
+  }
   if (Removable) {
     for (WeakTrackingVH &User : Users) {
       // Lowering all @llvm.objectsize and MTI calls first because they may use
diff --git a/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py b/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
new file mode 100644
index 0000000000000..86a7b0f7c8fff
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+"""Generate IR with a large non-removable alloca to stress-test
+isAllocSiteRemovable compile time."""
+
+import sys
+
+N = int(sys.argv[1]) if len(sys.argv) > 1 else 12000
+
+print("declare void @escape(ptr)")
+print()
+print("define i32 @stress() {")
+print("entry:")
+print(f"  %a = alloca [{N} x i8], align 1")
+print("  %sum0 = add i32 0, 0")
+for i in range(N):
+    print(f"  %p{i} = getelementptr inbounds [{N} x i8], ptr %a, i64 0, i64 {i}")
+    print(f"  %v{i} = load i8, ptr %p{i}, align 1")
+    print(f"  %z{i} = zext i8 %v{i} to i32")
+    print(f"  %sum{i+1} = add i32 %sum{i}, %z{i}")
+print("  call void @escape(ptr %a)")
+print(f"  ret i32 %sum{N}")
+print("}")
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
new file mode 100644
index 0000000000000..5ccd691790ea1
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+; Test that alloca with few users is still removed by
+; isAllocSiteRemovable (not blocked by user-count bailout).
+define i32 @test_few_users_still_removed() {
+; CHECK-LABEL: @test_few_users_still_removed(
+; CHECK-NEXT:    ret i32 0
+;
+  %a = alloca [4 x i8], align 1
+  %p0 = getelementptr inbounds [4 x i8], ptr %a, i64 0, i64 0
+  %p1 = getelementptr inbounds [4 x i8], ptr %a, i64 0, i64 1
+  %v0 = load i8, ptr %p0, align 1
+  %v1 = load i8, ptr %p1, align 1
+  %z0 = zext i8 %v0 to i32
+  %z1 = zext i8 %v1 to i32
+  %sum = add i32 %z0, %z1
+  ret i32 %sum
+}
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
new file mode 100644
index 0000000000000..8e2332afda83f
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
@@ -0,0 +1,9 @@
+; RUN: %python %S/Inputs/allocsite-removable-timeout-gen.py 12000 | \
+; RUN:   opt -passes=instcombine \
+; RUN:       -instcombine-max-allocsite-removable-users=128 \
+; RUN:       -disable-output
+
+; Compile-time regression test for isAllocSiteRemovable().
+; The generated function contains an alloca with many direct users plus one
+; escaping use, so the alloca is not removable. Without the bailout, this
+; takes O(N^2) time in the user walk. Success = InstCombine finishes quickly.

>From 5b371c9f6dfc4755b1bded2920a042829fcbd971 Mon Sep 17 00:00:00 2001
From: Jason Hafer <jhafer at mathworks.com>
Date: Sun, 5 Apr 2026 16:38:33 -0400
Subject: [PATCH 2/2] fixup! [InstCombine] Add user-count bailout to
 isAllocSiteRemovable

---
 .../InstCombine/InstructionCombining.cpp      | 23 ++++---------------
 .../Inputs/allocsite-removable-timeout-gen.py | 22 ------------------
 .../allocsite-removable-few-users.ll          | 18 ++++++++++++---
 .../allocsite-removable-timeout.ll            |  9 --------
 4 files changed, 19 insertions(+), 53 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
 delete mode 100644 llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 08e7db2f9248e..7b0bfab7edda5 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -146,7 +146,7 @@ MaxArraySize("instcombine-maxarray-size", cl::init(1024),
 
 static cl::opt<unsigned> MaxAllocSiteRemovableUsers(
     "instcombine-max-allocsite-removable-users", cl::Hidden, cl::init(2048),
-    cl::desc("Maximum direct users before skipping alloc-site "
+    cl::desc("Maximum number of users to visit in alloc-site "
              "removability analysis"));
 
 namespace llvm {
@@ -3748,6 +3748,8 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<Instruction *> &Users,
     Instruction *PI = Worklist.pop_back_val();
     for (User *U : PI->users()) {
       Instruction *I = cast<Instruction>(U);
+      if (Users.size() >= MaxAllocSiteRemovableUsers)
+        return std::nullopt;
       switch (I->getOpcode()) {
       default:
         // Give up the moment we see something we can't handle.
@@ -3902,7 +3904,6 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
   // register/unregister overhead; convert to WeakTrackingVH only when the
   // site is actually removable.
   SmallVector<Instruction *, 64> RawUsers;
-  SmallVector<WeakTrackingVH, 64> Users;
 
   // If we are removing an alloca with a dbg.declare, insert dbg.value calls
   // before each store.
@@ -3932,26 +3933,10 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
       F.hasFnAttribute(Attribute::SanitizeAddress))
     KnowInitUndef = false;
 
-  // Skip alloc sites with many direct users -- they are almost never removable
-  // and the transitive user walk in isAllocSiteRemovable is expensive.
-  {
-    unsigned DirectUserCount = 0;
-    for (auto UI = MI.user_begin(), UE = MI.user_end(); UI != UE; ++UI) {
-      if (++DirectUserCount > MaxAllocSiteRemovableUsers)
-        break;
-    }
-    if (DirectUserCount > MaxAllocSiteRemovableUsers)
-      return nullptr;
-  }
-
   auto Removable =
       isAllocSiteRemovable(&MI, RawUsers, TLI, KnowInitZero | KnowInitUndef);
   if (Removable) {
-    Users.reserve(RawUsers.size());
-    for (Instruction *I : RawUsers)
-      Users.emplace_back(I);
-  }
-  if (Removable) {
+    SmallVector<WeakTrackingVH, 64> Users(RawUsers.begin(), RawUsers.end());
     for (WeakTrackingVH &User : Users) {
       // Lowering all @llvm.objectsize and MTI calls first because they may use
       // a bitcast/GEP of the alloca we are removing.
diff --git a/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py b/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
deleted file mode 100644
index 86a7b0f7c8fff..0000000000000
--- a/llvm/test/Transforms/InstCombine/Inputs/allocsite-removable-timeout-gen.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env python3
-"""Generate IR with a large non-removable alloca to stress-test
-isAllocSiteRemovable compile time."""
-
-import sys
-
-N = int(sys.argv[1]) if len(sys.argv) > 1 else 12000
-
-print("declare void @escape(ptr)")
-print()
-print("define i32 @stress() {")
-print("entry:")
-print(f"  %a = alloca [{N} x i8], align 1")
-print("  %sum0 = add i32 0, 0")
-for i in range(N):
-    print(f"  %p{i} = getelementptr inbounds [{N} x i8], ptr %a, i64 0, i64 {i}")
-    print(f"  %v{i} = load i8, ptr %p{i}, align 1")
-    print(f"  %z{i} = zext i8 %v{i} to i32")
-    print(f"  %sum{i+1} = add i32 %sum{i}, %z{i}")
-print("  call void @escape(ptr %a)")
-print(f"  ret i32 %sum{N}")
-print("}")
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
index 5ccd691790ea1..e05ff2b23559f 100644
--- a/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
+++ b/llvm/test/Transforms/InstCombine/allocsite-removable-few-users.ll
@@ -1,11 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s --check-prefix=OPT
+; RUN: opt -passes=instcombine -instcombine-max-allocsite-removable-users=1 -S < %s | FileCheck %s --check-prefix=NOOPT
 
 ; Test that alloca with few users is still removed by
 ; isAllocSiteRemovable (not blocked by user-count bailout).
+; With a threshold of 1, the alloca is not removed.
 define i32 @test_few_users_still_removed() {
-; CHECK-LABEL: @test_few_users_still_removed(
-; CHECK-NEXT:    ret i32 0
+; OPT-LABEL: @test_few_users_still_removed(
+; OPT-NEXT:    ret i32 0
+;
+; NOOPT-LABEL: @test_few_users_still_removed(
+; NOOPT-NEXT:    [[A:%.*]] = alloca [4 x i8], align 1
+; NOOPT-NEXT:    [[P1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 1
+; NOOPT-NEXT:    [[V0:%.*]] = load i8, ptr [[A]], align 1
+; NOOPT-NEXT:    [[V1:%.*]] = load i8, ptr [[P1]], align 1
+; NOOPT-NEXT:    [[Z0:%.*]] = zext i8 [[V0]] to i32
+; NOOPT-NEXT:    [[Z1:%.*]] = zext i8 [[V1]] to i32
+; NOOPT-NEXT:    [[SUM:%.*]] = add nuw nsw i32 [[Z0]], [[Z1]]
+; NOOPT-NEXT:    ret i32 [[SUM]]
 ;
   %a = alloca [4 x i8], align 1
   %p0 = getelementptr inbounds [4 x i8], ptr %a, i64 0, i64 0
diff --git a/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll b/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
deleted file mode 100644
index 8e2332afda83f..0000000000000
--- a/llvm/test/Transforms/InstCombine/allocsite-removable-timeout.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: %python %S/Inputs/allocsite-removable-timeout-gen.py 12000 | \
-; RUN:   opt -passes=instcombine \
-; RUN:       -instcombine-max-allocsite-removable-users=128 \
-; RUN:       -disable-output
-
-; Compile-time regression test for isAllocSiteRemovable().
-; The generated function contains an alloca with many direct users plus one
-; escaping use, so the alloca is not removable. Without the bailout, this
-; takes O(N^2) time in the user walk. Success = InstCombine finishes quickly.



More information about the llvm-commits mailing list