[llvm] [Inliner] Don't count a call penalty for foldable __memcpy_chk and similar (PR #117876)

Marina Taylor via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 28 07:13:19 PST 2024


https://github.com/citymarina updated https://github.com/llvm/llvm-project/pull/117876

From e6bb6da0983e60c9ae8b7a213eedb2db88a0e079 Mon Sep 17 00:00:00 2001
From: Marina Taylor <marina_taylor at apple.com>
Date: Tue, 26 Nov 2024 22:07:55 +0000
Subject: [PATCH 1/3] [Inliner] Add tests for memcpy of small constant size.
 NFC

---
 .../Inline/AArch64/memcpy-constant-size.ll    | 88 ++++++++++++++++++
 .../AArch64/memcpy-constant-size.ll           | 92 +++++++++++++++++++
 2 files changed, 180 insertions(+)
 create mode 100644 llvm/test/Transforms/Inline/AArch64/memcpy-constant-size.ll
 create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll

diff --git a/llvm/test/Transforms/Inline/AArch64/memcpy-constant-size.ll b/llvm/test/Transforms/Inline/AArch64/memcpy-constant-size.ll
new file mode 100644
index 00000000000000..b1e9cc2e928f2b
--- /dev/null
+++ b/llvm/test/Transforms/Inline/AArch64/memcpy-constant-size.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt %s -mtriple=arm64-apple-macosx -passes=inline -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s
+
+declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1)
+declare ptr @__memcpy_chk(ptr, ptr, i64, i64)
+declare ptr @__memmove_chk(ptr, ptr, i64, i64)
+declare ptr @__mempcpy_chk(ptr, ptr, i64, i64)
+declare ptr @__memset_chk(ptr, i32, i64, i64)
+
+define void @callee(ptr %dst, ptr %src, i64 %size) {
+; CHECK-LABEL: define void @callee
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE:%.*]]) {
+; CHECK-NEXT:    [[OBJSIZE:%.*]] = call i64 @llvm.objectsize.i64.p0(ptr [[DST]], i1 false, i1 true, i1 false)
+; CHECK-NEXT:    [[CALL_MEMCPY:%.*]] = call ptr @__memcpy_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 [[OBJSIZE]])
+; CHECK-NEXT:    [[CALL_MEMMOVE:%.*]] = call ptr @__memmove_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 [[OBJSIZE]])
+; CHECK-NEXT:    [[CALL_MEMPCPY:%.*]] = call ptr @__mempcpy_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 [[OBJSIZE]])
+; CHECK-NEXT:    [[CALL_MEMSET:%.*]] = call ptr @__memset_chk(ptr [[DST]], i32 0, i64 [[SIZE]], i64 [[OBJSIZE]])
+; CHECK-NEXT:    ret void
+;
+  %objsize = call i64 @llvm.objectsize.i64.p0(ptr %dst, i1 false, i1 true, i1 false)
+  %call.memcpy = call ptr @__memcpy_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize)
+  %call.memmove = call ptr @__memmove_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize)
+  %call.mempcpy = call ptr @__mempcpy_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize)
+  %call.memset = call ptr @__memset_chk(ptr %dst, i32 0, i64 %size, i64 %objsize)
+  ret void
+}
+
+define void @caller(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @caller
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    call void @callee(ptr [[DST]], ptr [[SRC]], i64 4)
+; CHECK-NEXT:    ret void
+;
+  call void @callee(ptr %dst, ptr %src, i64 4)
+  ret void
+}
+
+define void @objsize_toosmall_callee(ptr %dst, ptr %src, i64 %size) {
+; CHECK-LABEL: define void @objsize_toosmall_callee
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE:%.*]]) {
+; CHECK-NEXT:    [[CALL_MEMCPY:%.*]] = call ptr @__memcpy_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 1)
+; CHECK-NEXT:    [[CALL_MEMMOVE:%.*]] = call ptr @__memmove_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 1)
+; CHECK-NEXT:    [[CALL_MEMPCPY:%.*]] = call ptr @__mempcpy_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 1)
+; CHECK-NEXT:    [[CALL_MEMSET:%.*]] = call ptr @__memset_chk(ptr [[DST]], i32 0, i64 [[SIZE]], i64 1)
+; CHECK-NEXT:    ret void
+;
+  %call.memcpy = call ptr @__memcpy_chk(ptr %dst, ptr %src, i64 %size, i64 1)
+  %call.memmove = call ptr @__memmove_chk(ptr %dst, ptr %src, i64 %size, i64 1)
+  %call.mempcpy = call ptr @__mempcpy_chk(ptr %dst, ptr %src, i64 %size, i64 1)
+  %call.memset = call ptr @__memset_chk(ptr %dst, i32 0, i64 %size, i64 1)
+  ret void
+}
+
+define void @objsize_toosmall_caller(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @objsize_toosmall_caller
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    call void @objsize_toosmall_callee(ptr [[DST]], ptr [[SRC]], i64 4)
+; CHECK-NEXT:    ret void
+;
+  call void @objsize_toosmall_callee(ptr %dst, ptr %src, i64 4)
+  ret void
+}
+
+define void @intrinsics_callee(ptr %dst, ptr %src, i64 %size) {
+; CHECK-LABEL: define void @intrinsics_callee
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE:%.*]]) {
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[DST]], i8 0, i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %size, i1 false)
+  call void @llvm.memmove.p0.p0.i64(ptr %dst, ptr %src, i64 %size, i1 false)
+  call void @llvm.memset.p0.i64(ptr %dst, i8 0, i64 %size, i1 false)
+  ret void
+}
+
+define void @intrinsics_caller(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @intrinsics_caller
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[DST]], i8 0, i64 4, i1 false)
+; CHECK-NEXT:    ret void
+;
+  call void @intrinsics_callee(ptr %dst, ptr %src, i64 4)
+  ret void
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
new file mode 100644
index 00000000000000..8c1e4c5ac80072
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt %s -mtriple=arm64-apple-macosx -passes='default<O3>' -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s
+
+declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1)
+declare ptr @__memcpy_chk(ptr, ptr, i64, i64)
+declare ptr @__memmove_chk(ptr, ptr, i64, i64)
+declare ptr @__mempcpy_chk(ptr, ptr, i64, i64)
+declare ptr @__memset_chk(ptr, i32, i64, i64)
+
+define void @callee_memcpy(ptr %dst, ptr %src, i64 %size) {
+; CHECK-LABEL: define void @callee_memcpy
+; CHECK-SAME: (ptr [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DST]], ptr align 1 [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %objsize = call i64 @llvm.objectsize.i64.p0(ptr %dst, i1 false, i1 true, i1 false)
+  %call.memcpy = call ptr @__memcpy_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize)
+  ret void
+}
+
+define void @callee_memmove(ptr %dst, ptr %src, i64 %size) {
+; CHECK-LABEL: define void @callee_memmove
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    tail call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DST]], ptr align 1 [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %objsize = call i64 @llvm.objectsize.i64.p0(ptr %dst, i1 false, i1 true, i1 false)
+  %call.memmove = call ptr @__memmove_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize)
+  ret void
+}
+
+define void @callee_mempcpy(ptr %dst, ptr %src, i64 %size) {
+; CHECK-LABEL: define void @callee_mempcpy
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DST]], ptr align 1 [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %objsize = call i64 @llvm.objectsize.i64.p0(ptr %dst, i1 false, i1 true, i1 false)
+  %call.mempcpy = call ptr @__mempcpy_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize)
+  ret void
+}
+
+define void @callee_memset(ptr %dst, i64 %size) {
+; CHECK-LABEL: define void @callee_memset
+; CHECK-SAME: (ptr [[DST:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr align 1 [[DST]], i8 0, i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %objsize = call i64 @llvm.objectsize.i64.p0(ptr %dst, i1 false, i1 true, i1 false)
+  %call.memset = call ptr @__memset_chk(ptr %dst, i32 0, i64 %size, i64 %objsize)
+  ret void
+}
+
+define void @caller_memcpy(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @caller_memcpy
+; CHECK-SAME: (ptr [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    tail call void @callee_memcpy(ptr [[DST]], ptr [[SRC]], i64 4)
+; CHECK-NEXT:    ret void
+;
+  call void @callee_memcpy(ptr %dst, ptr %src, i64 4)
+  ret void
+}
+
+define void @caller_memmove(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @caller_memmove
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    tail call void @callee_memmove(ptr [[DST]], ptr [[SRC]], i64 4)
+; CHECK-NEXT:    ret void
+;
+  call void @callee_memmove(ptr %dst, ptr %src, i64 4)
+  ret void
+}
+
+define void @caller_mempcpy(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @caller_mempcpy
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    tail call void @callee_mempcpy(ptr [[DST]], ptr [[SRC]], i64 4)
+; CHECK-NEXT:    ret void
+;
+  call void @callee_mempcpy(ptr %dst, ptr %src, i64 4)
+  ret void
+}
+
+define void @caller_memset(ptr %dst) {
+; CHECK-LABEL: define void @caller_memset
+; CHECK-SAME: (ptr [[DST:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    tail call void @callee_memset(ptr [[DST]], i64 4)
+; CHECK-NEXT:    ret void
+;
+  call void @callee_memset(ptr %dst, i64 4)
+  ret void
+}

From fe92cd62e5ef63a4337c5abfff6b964ac415e660 Mon Sep 17 00:00:00 2001
From: Marina Taylor <marina_taylor at apple.com>
Date: Wed, 27 Nov 2024 18:10:17 +0000
Subject: [PATCH 2/3] [Inliner] Plumb a TargetLibraryInfo through to
 CallAnalyzer. NFC
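
This threads a TargetLibraryInfo getter alongside the existing
GetAssumptionCache/GetBFI callbacks. As a minimal sketch (not part of
this patch) of how a caller with access to a FunctionAnalysisManager
might supply the new callback, assuming TargetLibraryAnalysis is
registered as usual:

  // Hypothetical caller-side setup; FAM is a FunctionAnalysisManager.
  // (Needs llvm/Analysis/TargetLibraryInfo.h.)
  auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
    return FAM.getResult<TargetLibraryAnalysis>(F);
  };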

---
 llvm/include/llvm/Analysis/InlineCost.h |  2 ++
 llvm/lib/Analysis/InlineCost.cpp        | 45 ++++++++++++++++---------
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Analysis/InlineCost.h b/llvm/include/llvm/Analysis/InlineCost.h
index 22e0b1bc901a43..ed54b0c077b4a4 100644
--- a/llvm/include/llvm/Analysis/InlineCost.h
+++ b/llvm/include/llvm/Analysis/InlineCost.h
@@ -318,6 +318,7 @@ std::optional<int> getInliningCostEstimate(
     CallBase &Call, TargetTransformInfo &CalleeTTI,
     function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
     function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
+    function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
     ProfileSummaryInfo *PSI = nullptr,
     OptimizationRemarkEmitter *ORE = nullptr);
 
@@ -327,6 +328,7 @@ std::optional<InlineCostFeatures> getInliningCostFeatures(
     CallBase &Call, TargetTransformInfo &CalleeTTI,
     function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
     function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
+    function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
     ProfileSummaryInfo *PSI = nullptr,
     OptimizationRemarkEmitter *ORE = nullptr);
 
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 22bb406c01a4ed..fcb1ca1be0952e 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -249,6 +249,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// Getter for BlockFrequencyInfo
   function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
 
+  /// Getter for TargetLibraryInfo
+  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
+
   /// Profile summary information.
   ProfileSummaryInfo *PSI;
 
@@ -492,13 +495,15 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool visitUnreachableInst(UnreachableInst &I);
 
 public:
-  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
-               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
-               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
-               ProfileSummaryInfo *PSI = nullptr,
-               OptimizationRemarkEmitter *ORE = nullptr)
+  CallAnalyzer(
+      Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
+      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
+      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
+      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
+      ProfileSummaryInfo *PSI = nullptr,
+      OptimizationRemarkEmitter *ORE = nullptr)
       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
-        PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
+        GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
         CandidateCall(Call) {}
 
   InlineResult analyze();
@@ -688,7 +693,8 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
       /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
       /// to instantiate the derived class.
       InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
-                                GetAssumptionCache, GetBFI, PSI, ORE, false);
+                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
+                                false);
       if (CA.analyze().isSuccess()) {
         // We were able to inline the indirect call! Subtract the cost from the
         // threshold to get the bonus we want to apply, but don't go below zero.
@@ -1106,10 +1112,12 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
       const TargetTransformInfo &TTI,
       function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
       function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
+      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
       ProfileSummaryInfo *PSI = nullptr,
       OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
       bool IgnoreThreshold = false)
-      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
+      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI,
+                     ORE),
         ComputeFullInlineCost(OptComputeFullInlineCost ||
                               Params.ComputeFullInlineCost || ORE ||
                               isCostBenefitAnalysisEnabled()),
@@ -1228,8 +1236,8 @@ class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
           InlineConstants::IndirectCallThreshold;
 
       InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
-                                GetAssumptionCache, GetBFI, PSI, ORE, false,
-                                true);
+                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
+                                false, true);
       if (CA.analyze().isSuccess()) {
         increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
                   CA.getCost());
@@ -1355,9 +1363,11 @@ class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
       const TargetTransformInfo &TTI,
       function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
       function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+      function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
       ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
       CallBase &Call)
-      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}
+      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI,
+                     PSI) {}
 
   const InlineCostFeatures &features() const { return Cost; }
 };
@@ -2945,6 +2955,7 @@ std::optional<int> llvm::getInliningCostEstimate(
     CallBase &Call, TargetTransformInfo &CalleeTTI,
     function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
     function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
   const InlineParams Params = {/* DefaultThreshold*/ 0,
                                /*HintThreshold*/ {},
@@ -2958,7 +2969,7 @@ std::optional<int> llvm::getInliningCostEstimate(
                                /*EnableDeferral*/ true};
 
   InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
-                            GetAssumptionCache, GetBFI, PSI, ORE, true,
+                            GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true,
                             /*IgnoreThreshold*/ true);
   auto R = CA.analyze();
   if (!R.isSuccess())
@@ -2970,9 +2981,10 @@ std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
     CallBase &Call, TargetTransformInfo &CalleeTTI,
     function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
     function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
-  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
-                                 ORE, *Call.getCalledFunction(), Call);
+  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI,
+                                 PSI, ORE, *Call.getCalledFunction(), Call);
   auto R = CFA.analyze();
   if (!R.isSuccess())
     return std::nullopt;
@@ -3072,7 +3084,7 @@ InlineCost llvm::getInlineCost(
                           << ")\n");
 
   InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
-                            GetAssumptionCache, GetBFI, PSI, ORE);
+                            GetAssumptionCache, GetBFI, GetTLI, PSI, ORE);
   InlineResult ShouldInline = CA.analyze();
 
   LLVM_DEBUG(CA.dump());
@@ -3263,7 +3275,8 @@ InlineCostAnnotationPrinterPass::run(Function &F,
           continue;
         OptimizationRemarkEmitter ORE(CalledFunction);
         InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI,
-                                    GetAssumptionCache, nullptr, &PSI, &ORE);
+                                    GetAssumptionCache, nullptr, nullptr, &PSI,
+                                    &ORE);
         ICCA.analyze();
         OS << "      Analyzing call of " << CalledFunction->getName()
            << "... (caller:" << CB->getCaller()->getName() << ")\n";

From d2301f93bb8192518fca5e24c0bafa1f4ccae2ca Mon Sep 17 00:00:00 2001
From: Marina Taylor <marina_taylor at apple.com>
Date: Wed, 27 Nov 2024 18:14:19 +0000
Subject: [PATCH 3/3] [Inliner] Don't count a call penalty for foldable
 __memcpy_chk

When the copy length is known to fit within the object size, calls to
__memcpy_chk will eventually be replaced by inline stores. This patch
therefore avoids counting the call penalty for such calls when
computing inlining costs.

This is only really relevant on platforms whose headers redirect memcpy
to __memcpy_chk (such as Darwin). On platforms that use the memcpy
intrinsics directly, memcpy and similar functions are already exempt
from call penalties.
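
For reference, a simplified sketch of the Darwin-style header
redirection (an approximation for illustration, not the verbatim
system header):

  // With _FORTIFY_SOURCE, memcpy is mapped to the checked variant,
  // passing the destination's object size (or -1 when unknown).
  #define memcpy(dest, src, len) \
    __builtin___memcpy_chk(dest, src, len, __builtin_object_size(dest, 0))

Once the length is a constant that fits within the object-size
argument, LLVM's fortified-libcall simplification turns the checked
call into a plain llvm.memcpy, which in turn lowers to inline loads and
stores, as the PhaseOrdering test below demonstrates.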
---
 llvm/lib/Analysis/InlineCost.cpp              | 41 ++++++++++++++++++-
 .../Inline/AArch64/memcpy-constant-size.ll    |  6 ++-
 .../AArch64/memcpy-constant-size.ll           | 11 +++--
 3 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index fcb1ca1be0952e..32acf23e1d0d0d 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -436,6 +436,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool simplifyIntrinsicCallIsConstant(CallBase &CB);
   bool simplifyIntrinsicCallObjectSize(CallBase &CB);
   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
+  bool isLoweredToCall(Function *F, CallBase &Call);
 
   /// Return true if the given argument to the function being considered for
   /// inlining has the given attribute set either at the call site or the
@@ -2270,6 +2271,44 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
   return false;
 }
 
+bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) {
+  const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr;
+  LibFunc LF;
+  if (!TLI || !TLI->getLibFunc(*F, LF) || !TLI->has(LF))
+    return TTI.isLoweredToCall(F);
+
+  switch (LF) {
+  case LibFunc_memcpy_chk:
+  case LibFunc_memmove_chk:
+  case LibFunc_mempcpy_chk:
+  case LibFunc_memset_chk: {
+    // Calls to __memcpy_chk whose length is known to fit within the object
+    // size will eventually be replaced by inline stores. Therefore, these
+    // should not incur a call penalty. This is only really relevant on
+    // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as
+    // other platforms use memcpy intrinsics, which are already exempt from the
+    // call penalty.
+    auto *LenOp = dyn_cast<ConstantInt>(Call.getOperand(2));
+    if (!LenOp)
+      LenOp = dyn_cast_or_null<ConstantInt>(
+          SimplifiedValues.lookup(Call.getOperand(2)));
+    auto *ObjSizeOp = dyn_cast<ConstantInt>(Call.getOperand(3));
+    if (!ObjSizeOp)
+      ObjSizeOp = dyn_cast_or_null<ConstantInt>(
+          SimplifiedValues.lookup(Call.getOperand(3)));
+    if (LenOp && ObjSizeOp &&
+        LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) {
+      return false;
+    }
+    break;
+  }
+  default:
+    break;
+  }
+
+  return TTI.isLoweredToCall(F);
+}
+
 bool CallAnalyzer::visitCallBase(CallBase &Call) {
   if (!onCallBaseVisitStart(Call))
     return true;
@@ -2351,7 +2390,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
       return false;
   }
 
-  if (TTI.isLoweredToCall(F)) {
+  if (isLoweredToCall(F, Call)) {
     onLoweredCall(F, Call, IsIndirectCall);
   }
 
diff --git a/llvm/test/Transforms/Inline/AArch64/memcpy-constant-size.ll b/llvm/test/Transforms/Inline/AArch64/memcpy-constant-size.ll
index b1e9cc2e928f2b..17f7024ff8905d 100644
--- a/llvm/test/Transforms/Inline/AArch64/memcpy-constant-size.ll
+++ b/llvm/test/Transforms/Inline/AArch64/memcpy-constant-size.ll
@@ -28,7 +28,11 @@ define void @callee(ptr %dst, ptr %src, i64 %size) {
 define void @caller(ptr %dst, ptr %src) {
 ; CHECK-LABEL: define void @caller
 ; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) {
-; CHECK-NEXT:    call void @callee(ptr [[DST]], ptr [[SRC]], i64 4)
+; CHECK-NEXT:    [[OBJSIZE_I:%.*]] = call i64 @llvm.objectsize.i64.p0(ptr [[DST]], i1 false, i1 true, i1 false)
+; CHECK-NEXT:    [[CALL_MEMCPY_I:%.*]] = call ptr @__memcpy_chk(ptr [[DST]], ptr [[SRC]], i64 4, i64 [[OBJSIZE_I]])
+; CHECK-NEXT:    [[CALL_MEMMOVE_I:%.*]] = call ptr @__memmove_chk(ptr [[DST]], ptr [[SRC]], i64 4, i64 [[OBJSIZE_I]])
+; CHECK-NEXT:    [[CALL_MEMPCPY_I:%.*]] = call ptr @__mempcpy_chk(ptr [[DST]], ptr [[SRC]], i64 4, i64 [[OBJSIZE_I]])
+; CHECK-NEXT:    [[CALL_MEMSET_I:%.*]] = call ptr @__memset_chk(ptr [[DST]], i32 0, i64 4, i64 [[OBJSIZE_I]])
 ; CHECK-NEXT:    ret void
 ;
   call void @callee(ptr %dst, ptr %src, i64 4)
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
index 8c1e4c5ac80072..10b07ad6e7491d 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
@@ -54,7 +54,8 @@ define void @callee_memset(ptr %dst, i64 %size) {
 define void @caller_memcpy(ptr %dst, ptr %src) {
 ; CHECK-LABEL: define void @caller_memcpy
 ; CHECK-SAME: (ptr [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:    tail call void @callee_memcpy(ptr [[DST]], ptr [[SRC]], i64 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC]], align 1
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[DST]], align 1
 ; CHECK-NEXT:    ret void
 ;
   call void @callee_memcpy(ptr %dst, ptr %src, i64 4)
@@ -64,7 +65,8 @@ define void @caller_memcpy(ptr %dst, ptr %src) {
 define void @caller_memmove(ptr %dst, ptr %src) {
 ; CHECK-LABEL: define void @caller_memmove
 ; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) local_unnamed_addr #[[ATTR1]] {
-; CHECK-NEXT:    tail call void @callee_memmove(ptr [[DST]], ptr [[SRC]], i64 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC]], align 1
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[DST]], align 1
 ; CHECK-NEXT:    ret void
 ;
   call void @callee_memmove(ptr %dst, ptr %src, i64 4)
@@ -74,7 +76,8 @@ define void @caller_memmove(ptr %dst, ptr %src) {
 define void @caller_mempcpy(ptr %dst, ptr %src) {
 ; CHECK-LABEL: define void @caller_mempcpy
 ; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) local_unnamed_addr #[[ATTR1]] {
-; CHECK-NEXT:    tail call void @callee_mempcpy(ptr [[DST]], ptr [[SRC]], i64 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC]], align 1
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[DST]], align 1
 ; CHECK-NEXT:    ret void
 ;
   call void @callee_mempcpy(ptr %dst, ptr %src, i64 4)
@@ -84,7 +87,7 @@ define void @caller_mempcpy(ptr %dst, ptr %src) {
 define void @caller_memset(ptr %dst) {
 ; CHECK-LABEL: define void @caller_memset
 ; CHECK-SAME: (ptr [[DST:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:    tail call void @callee_memset(ptr [[DST]], i64 4)
+; CHECK-NEXT:    store i32 0, ptr [[DST]], align 1
 ; CHECK-NEXT:    ret void
 ;
   call void @callee_memset(ptr %dst, i64 4)


