>From 1c8845a17bcad38b8baec73616ea13497d99fd1f Mon Sep 17 00:00:00 2001
From: YAMAMOTO Takashi <yamamoto at midokura.com>
Date: Fri, 17 May 2024 14:31:17 +0900
Subject: [PATCH 1/2] [Pipelines] Perform mergefunc after constmerge

Constmerge can fold switch jump tables, possibly making functions
identical again. It can help mergefunc.
On the other hand, the opposite seems unlikely.

Fixes https://github.com/llvm/llvm-project/issues/92201
 llvm/lib/Passes/PassBuilderPipelines.cpp | 9 +++++----
 llvm/test/Other/new-pm-defaults.ll       | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 926515c9508a97..4fd5ee1946bb77 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1527,10 +1527,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   if (EnableIROutliner)
-  // Merge functions if requested.
-  if (PTO.MergeFunctions)
-    MPM.addPass(MergeFunctionsPass());
   // Now we need to do some global optimization transforms.
   // FIXME: It would seem like these should come first in the optimization
   // pipeline and maybe be the bottom of the canonicalization pipeline? Weird
@@ -1538,6 +1534,11 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
+  // Merge functions if requested. MergeFunctions has a better chance of
+  // merging functions after ConstantMerge has folded jump tables.
+  if (PTO.MergeFunctions)
+    MPM.addPass(MergeFunctionsPass());
   if (PTO.CallGraphProfile && !LTOPreLink)
     MPM.addPass(CGProfilePass(LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
                               LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink));
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 489aed40c190b4..588337c15625e6 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -281,9 +281,9 @@
 ; CHECK-HOT-COLD-SPLIT-NEXT: Running pass: HotColdSplittingPass
 ; CHECK-IR-OUTLINER-NEXT: Running pass: IROutlinerPass
 ; CHECK-IR-OUTLINER-NEXT: Running analysis: IRSimilarityAnalysis
-; CHECK-MERGE-FUNCS-NEXT: Running pass: MergeFunctionsPass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
 ; CHECK-O-NEXT: Running pass: ConstantMergePass
+; CHECK-MERGE-FUNCS-NEXT: Running pass: MergeFunctionsPass
 ; CHECK-DEFAULT-NEXT: Running pass: CGProfilePass
 ; CHECK-DEFAULT-NEXT: Running pass: RelLookupTableConverterPass
 ; CHECK-LTO-NOT: Running pass: RelLookupTableConverterPass

>From a6d30c85d21401dde8a0a75d629c45bac8fadbd2 Mon Sep 17 00:00:00 2001
From: YAMAMOTO Takashi <yamamoto at midokura.com>
Date: Wed, 26 Jun 2024 18:49:45 +0900
Subject: [PATCH 2/2] add a few tests

 .../PhaseOrdering/X86/merge-functions2.ll     | 67 +++++++++++++++++++
 .../PhaseOrdering/X86/merge-functions3.ll     | 47 +++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/merge-functions2.ll
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/merge-functions3.ll

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions2.ll b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions2.ll
new file mode 100644
index 00000000000000..5d650d5f080bac
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions2.ll
@@ -0,0 +1,67 @@
+; RUN: opt -passes="default<O3>" -enable-merge-functions -S < %s | FileCheck %s
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx12.0.0"
+; Function Attrs: noinline nounwind optsize ssp uwtable
+define i32 @f(i32 noundef %x) #0 {
+  %x.addr = alloca i32, align 4
+  store i32 %x, ptr %x.addr, align 4, !tbaa !5
+  %0 = load i32, ptr %x.addr, align 4, !tbaa !5
+  switch i32 %0, label %sw.default [
+    i32 0, label %sw.bb
+    i32 2, label %sw.bb
+    i32 4, label %sw.bb
+    i32 6, label %sw.bb
+    i32 7, label %sw.bb
+  ]
+sw.bb:                                            ; preds = %entry, %entry, %entry, %entry, %entry
+  store i32 1, ptr %x.addr, align 4, !tbaa !5
+  br label %sw.epilog
+sw.default:                                       ; preds = %entry
+  store i32 0, ptr %x.addr, align 4, !tbaa !5
+  br label %sw.epilog
+sw.epilog:                                        ; preds = %sw.default, %sw.bb
+  %1 = load i32, ptr %x.addr, align 4, !tbaa !5
+  ret i32 %1
+; Function Attrs: noinline nounwind optsize ssp uwtable
+define i32 @g(i32 noundef %x) #0 {
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call range(i32 0, 2) i32 @f(i32 noundef [[TMP0:%.*]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+  %x.addr = alloca i32, align 4
+  store i32 %x, ptr %x.addr, align 4, !tbaa !5
+  %0 = load i32, ptr %x.addr, align 4, !tbaa !5
+  switch i32 %0, label %sw.default [
+    i32 0, label %sw.bb
+    i32 2, label %sw.bb
+    i32 4, label %sw.bb
+    i32 6, label %sw.bb
+    i32 7, label %sw.bb
+  ]
+sw.bb:                                            ; preds = %entry, %entry, %entry, %entry, %entry
+  store i32 1, ptr %x.addr, align 4, !tbaa !5
+  br label %sw.epilog
+sw.default:                                       ; preds = %entry
+  store i32 0, ptr %x.addr, align 4, !tbaa !5
+  br label %sw.epilog
+sw.epilog:                                        ; preds = %sw.default, %sw.bb
+  %1 = load i32, ptr %x.addr, align 4, !tbaa !5
+  ret i32 %1
+!5 = !{!6, !6, i64 0}
+!6 = !{!"int", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions3.ll b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions3.ll
new file mode 100644
index 00000000000000..bce8f08ceda5e0
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions3.ll
@@ -0,0 +1,47 @@
+; RUN: opt -passes="default<O3>" -enable-merge-functions -S < %s | FileCheck %s
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx12.0.0"
+@switch.table.f = private unnamed_addr constant [8 x i32] [i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1], align 4
+@switch.table.g = private unnamed_addr constant [8 x i32] [i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1], align 4
+; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind optsize ssp willreturn memory(none) uwtable
+define range(i32 0, 2) i32 @f(i32 noundef %x) local_unnamed_addr #0 {
+  %0 = icmp ult i32 %x, 8
+  br i1 %0, label %switch.lookup, label %sw.epilog
+switch.lookup:                                    ; preds = %entry
+  %1 = zext nneg i32 %x to i64
+  %switch.gep = getelementptr inbounds [8 x i32], ptr @switch.table.f, i64 0, i64 %1
+  %switch.load = load i32, ptr %switch.gep, align 4
+  br label %sw.epilog
+sw.epilog:                                        ; preds = %entry, %switch.lookup
+  %x.addr.0 = phi i32 [ %switch.load, %switch.lookup ], [ 0, %entry ]
+  ret i32 %x.addr.0
+; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind optsize ssp willreturn memory(none) uwtable
+define range(i32 0, 2) i32 @g(i32 noundef %x) local_unnamed_addr #0 {
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call range(i32 0, 2) i32 @f(i32 noundef [[TMP0:%.*]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+  %0 = icmp ult i32 %x, 8
+  br i1 %0, label %switch.lookup, label %sw.epilog
+switch.lookup:                                    ; preds = %entry
+  %1 = zext nneg i32 %x to i64
+  %switch.gep = getelementptr inbounds [8 x i32], ptr @switch.table.g, i64 0, i64 %1
+  %switch.load = load i32, ptr %switch.gep, align 4
+  br label %sw.epilog
+sw.epilog:                                        ; preds = %entry, %switch.lookup
+  %x.addr.0 = phi i32 [ %switch.load, %switch.lookup ], [ 0, %entry ]
+  ret i32 %x.addr.0
+attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind optsize ssp willreturn memory(none) uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cmov,+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" }

