[llvm] r372232 - [SampleFDO] Minimize performance impact when profile-sample-accurate

Wei Mi via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 18 09:06:28 PDT 2019


Author: wmi
Date: Wed Sep 18 09:06:28 2019
New Revision: 372232

URL: http://llvm.org/viewvc/llvm-project?rev=372232&view=rev
Log:
[SampleFDO] Minimize performance impact when profile-sample-accurate
is enabled.

We can save memory and reduce binary size significantly by enabling
ProfileSampleAccurate. However when ProfileSampleAccurate is true,
function without sample will be regarded as cold and this could
potentially cause performance regression.

To minimize the potential negative performance impact, we want to be
a little conservative here saying if a function shows up in the profile,
no matter as outline instance, inline instance or call targets, treat
the function as not being cold. This will handle the cases such as most
callsites of a function are inlined in sampled binary (thus outline copy
don't get any sample) but not inlined in current build (because of source
code drift, imprecise debug information, or the callsites are all cold
individually but not cold accumulatively...), so that the outline function
showing up as cold in sampled binary will actually not be cold after current
build. After the change, such function will be treated as not cold even
profile-sample-accurate is enabled.

At the same time we lower the hot criteria of callsiteIsHot check when
profile-sample-accurate is enabled. callsiteIsHot is used to determined
whether a callsite is hot and qualified for early inlining. When
profile-sample-accurate is enabled, functions without profile will be
regarded as cold and much less inlining will happen in CGSCC inlining pass,
so we can worry less about size increase and be aggressive to allow more
early inlining to happen for warm callsites and it is helpful for performance
overall.

Differential Revision: https://reviews.llvm.org/D67561

Added:
    llvm/trunk/test/Transforms/SampleProfile/Inputs/profsampleacc.extbinary.afdo   (with props)
    llvm/trunk/test/Transforms/SampleProfile/profile-sample-accurate.ll
Modified:
    llvm/trunk/include/llvm/ProfileData/SampleProfReader.h
    llvm/trunk/lib/Transforms/IPO/SampleProfile.cpp

Modified: llvm/trunk/include/llvm/ProfileData/SampleProfReader.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/ProfileData/SampleProfReader.h?rev=372232&r1=372231&r2=372232&view=diff
==============================================================================
--- llvm/trunk/include/llvm/ProfileData/SampleProfReader.h (original)
+++ llvm/trunk/include/llvm/ProfileData/SampleProfReader.h Wed Sep 18 09:06:28 2019
@@ -330,6 +330,10 @@ public:
     return nullptr;
   };
 
+  /// It includes all the names that have samples either in outline instance
+  /// or inline instance.
+  virtual std::vector<StringRef> *getNameTable() { return nullptr; }
+
 protected:
   /// Map every function to its associated profile.
   ///
@@ -387,6 +391,10 @@ public:
   /// Read sample profiles from the associated file.
   std::error_code read() override;
 
+  /// It includes all the names that have samples either in outline instance
+  /// or inline instance.
+  virtual std::vector<StringRef> *getNameTable() override { return &NameTable; }
+
 protected:
   /// Read a numeric value of type T from the profile.
   ///

Modified: llvm/trunk/lib/Transforms/IPO/SampleProfile.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/IPO/SampleProfile.cpp?rev=372232&r1=372231&r2=372232&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/IPO/SampleProfile.cpp (original)
+++ llvm/trunk/lib/Transforms/IPO/SampleProfile.cpp Wed Sep 18 09:06:28 2019
@@ -139,9 +139,11 @@ using EdgeWeightMap = DenseMap<Edge, uin
 using BlockEdgeMap =
     DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
 
+class SampleProfileLoader;
+
 class SampleCoverageTracker {
 public:
-  SampleCoverageTracker() = default;
+  SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL){};
 
   bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
                        uint32_t Discriminator, uint64_t Samples);
@@ -187,6 +189,8 @@ private:
   /// keyed by FunctionSamples pointers, but these stats are cleared after
   /// every function, so we just need to keep a single counter.
   uint64_t TotalUsedSamples = 0;
+
+  SampleProfileLoader &SPLoader;
 };
 
 class GUIDToFuncNameMapper {
@@ -269,8 +273,9 @@ public:
       std::function<AssumptionCache &(Function &)> GetAssumptionCache,
       std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo)
       : GetAC(std::move(GetAssumptionCache)),
-        GetTTI(std::move(GetTargetTransformInfo)), Filename(Name),
-        RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {}
+        GetTTI(std::move(GetTargetTransformInfo)), CoverageTracker(*this),
+        Filename(Name), RemappingFilename(RemapName),
+        IsThinLTOPreLink(IsThinLTOPreLink) {}
 
   bool doInitialization(Module &M);
   bool runOnModule(Module &M, ModuleAnalysisManager *AM,
@@ -279,6 +284,8 @@ public:
   void dump() { Reader->dump(); }
 
 protected:
+  friend class SampleCoverageTracker;
+
   bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
   unsigned getFunctionLoc(Function &F);
   bool emitAnnotations(Function &F);
@@ -307,6 +314,8 @@ protected:
   bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
   void computeDominanceAndLoopInfo(Function &F);
   void clearFunctionData();
+  bool callsiteIsHot(const FunctionSamples *CallsiteFS,
+                     ProfileSummaryInfo *PSI);
 
   /// Map basic blocks to their computed weights.
   ///
@@ -404,6 +413,13 @@ protected:
   // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
   // all the function symbols defined or declared in current module.
   DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
+
+  // All the Names used in FunctionSamples including outline function
+  // names, inline instance names and call target names.
+  StringSet<> NamesInProfile;
+
+  // Showing whether ProfileSampleAccurate is enabled for current function.
+  bool ProfSampleAccEnabled = false;
 };
 
 class SampleProfileLoaderLegacyPass : public ModulePass {
@@ -459,14 +475,22 @@ private:
 /// To decide whether an inlined callsite is hot, we compare the callsite
 /// sample count with the hot cutoff computed by ProfileSummaryInfo, it is
 /// regarded as hot if the count is above the cutoff value.
-static bool callsiteIsHot(const FunctionSamples *CallsiteFS,
-                          ProfileSummaryInfo *PSI) {
+///
+/// When profile-sample-accurate is enabled, functions without profile will
+/// be regarded as cold and much less inlining will happen in CGSCC inlining
+/// pass, so we tend to lower the hot criteria here to allow more early
+/// inlining to happen for warm callsites and it is helpful for performance.
+bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS,
+                                        ProfileSummaryInfo *PSI) {
   if (!CallsiteFS)
     return false; // The callsite was not inlined in the original binary.
 
   assert(PSI && "PSI is expected to be non null");
   uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
-  return PSI->isHotCount(CallsiteTotalSamples);
+  if (ProfSampleAccEnabled)
+    return !PSI->isColdCount(CallsiteTotalSamples);
+  else
+    return PSI->isHotCount(CallsiteTotalSamples);
 }
 
 /// Mark as used the sample record for the given function samples at
@@ -503,7 +527,7 @@ SampleCoverageTracker::countUsedRecords(
   for (const auto &I : FS->getCallsiteSamples())
     for (const auto &J : I.second) {
       const FunctionSamples *CalleeSamples = &J.second;
-      if (callsiteIsHot(CalleeSamples, PSI))
+      if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
         Count += countUsedRecords(CalleeSamples, PSI);
     }
 
@@ -522,7 +546,7 @@ SampleCoverageTracker::countBodyRecords(
   for (const auto &I : FS->getCallsiteSamples())
     for (const auto &J : I.second) {
       const FunctionSamples *CalleeSamples = &J.second;
-      if (callsiteIsHot(CalleeSamples, PSI))
+      if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
         Count += countBodyRecords(CalleeSamples, PSI);
     }
 
@@ -543,7 +567,7 @@ SampleCoverageTracker::countBodySamples(
   for (const auto &I : FS->getCallsiteSamples())
     for (const auto &J : I.second) {
       const FunctionSamples *CalleeSamples = &J.second;
-      if (callsiteIsHot(CalleeSamples, PSI))
+      if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
         Total += countBodySamples(CalleeSamples, PSI);
     }
 
@@ -1643,6 +1667,12 @@ bool SampleProfileLoader::doInitializati
   ProfileIsValid = (Reader->read() == sampleprof_error::success);
   PSL = Reader->getProfileSymbolList();
 
+  if (ProfileSampleAccurate) {
+    NamesInProfile.clear();
+    if (auto NameTable = Reader->getNameTable())
+      NamesInProfile.insert(NameTable->begin(), NameTable->end());
+  }
+
   if (!RemappingFilename.empty()) {
     // Apply profile remappings to the loaded profile data if requested.
     // For now, we only support remapping symbols encoded using the Itanium
@@ -1733,17 +1763,36 @@ bool SampleProfileLoader::runOnFunction(
   // conservatively by getEntryCount as the same as unknown (None). This is
   // to avoid newly added code to be treated as cold. If we have samples
   // this will be overwritten in emitAnnotations.
-  //
-  // PSL -- profile symbol list include all the symbols in sampled binary.
-  // If ProfileSampleAccurate is true or F has profile-sample-accurate
-  // attribute, and if there is no profile symbol list read in, initialize
-  // all the function entry counts to 0; if there is profile symbol list, only
-  // initialize the entry count to 0 when current function is in the list.
-  uint64_t initialEntryCount =
-      ((ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) &&
-       (!PSL || PSL->contains(F.getName())))
-          ? 0
-          : -1;
+  uint64_t initialEntryCount = -1;
+
+  ProfSampleAccEnabled =
+      ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate");
+  if (ProfSampleAccEnabled) {
+    // PSL -- profile symbol list include all the symbols in sampled binary.
+    // It is used to prevent new functions to be treated as cold.
+    // If ProfileSampleAccurate is true or F has profile-sample-accurate
+    // attribute, and if there is no profile symbol list read in, initialize
+    // all the function entry counts to 0; if there is profile symbol list, only
+    // initialize the entry count to 0 when current function is in the list.
+    if (!PSL || PSL->contains(F.getName()))
+      initialEntryCount = 0;
+
+    // When ProfileSampleAccurate is true, function without sample will be
+    // regarded as cold. To minimize the potential negative performance
+    // impact it could have, we want to be a little conservative here
+    // saying if a function shows up in the profile, no matter as outline
+    // function, inline instance or call targets, treat the function as not
+    // being cold. This will handle the cases such as most callsites of a
+    // function are inlined in sampled binary but not inlined in current
+    // build (because of source code drift, imprecise debug information, or
+    // the callsites are all cold individually but not cold accumulatively...),
+    // so the outline function showing up as cold in sampled binary will
+    // actually not be cold after current build.
+    StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
+    if (NamesInProfile.count(CanonName))
+      initialEntryCount = -1;
+  }
+
   F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
   std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
   if (AM) {

Added: llvm/trunk/test/Transforms/SampleProfile/Inputs/profsampleacc.extbinary.afdo
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SampleProfile/Inputs/profsampleacc.extbinary.afdo?rev=372232&view=auto
==============================================================================
Binary file - no diff available.

Propchange: llvm/trunk/test/Transforms/SampleProfile/Inputs/profsampleacc.extbinary.afdo
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: llvm/trunk/test/Transforms/SampleProfile/profile-sample-accurate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SampleProfile/profile-sample-accurate.ll?rev=372232&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SampleProfile/profile-sample-accurate.ll (added)
+++ llvm/trunk/test/Transforms/SampleProfile/profile-sample-accurate.ll Wed Sep 18 09:06:28 2019
@@ -0,0 +1,118 @@
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -S | FileCheck %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -S | FileCheck %s
+; With the hot cutoff being set to 600000, the inline instance of _Z3sumii
+; in main is neither hot nor cold. Check it will still be inlined when
+; profile-sample-accurate is enabled, and check _Z3sumii's function entry
+; count won't be initialized to 0 because it shows up in the profile as
+; inline instance.
+
+; Original C++ test case
+;
+; #include <stdio.h>
+;
+; int sum(int x, int y) {
+;   return x + y;
+; }
+;
+; int main() {
+;   int s, i = 0;
+;   while (i++ < 20000 * 20000)
+;     if (i != 100) s = sum(i, s); else s = 30;
+;   printf("sum is %d\n", s);
+;   return 0;
+; }
+;
+ at .str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
+
+; CHECK: define i32 @_Z3sumii{{.*}}!prof ![[UNKNOWN_ID:[0-9]+]]
+; Function Attrs: nounwind uwtable
+define i32 @_Z3sumii(i32 %x, i32 %y) !dbg !4 {
+entry:
+  %x.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  store i32 %x, i32* %x.addr, align 4
+  store i32 %y, i32* %y.addr, align 4
+  %0 = load i32, i32* %x.addr, align 4, !dbg !11
+  %1 = load i32, i32* %y.addr, align 4, !dbg !11
+  %add = add nsw i32 %0, %1, !dbg !11
+  ret i32 %add, !dbg !11
+}
+
+; Function Attrs: uwtable
+define i32 @main() !dbg !7 {
+entry:
+  %retval = alloca i32, align 4
+  %s = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %i, align 4, !dbg !12
+  br label %while.cond, !dbg !13
+
+while.cond:                                       ; preds = %if.end, %entry
+  %0 = load i32, i32* %i, align 4, !dbg !14
+  %inc = add nsw i32 %0, 1, !dbg !14
+  store i32 %inc, i32* %i, align 4, !dbg !14
+  %cmp = icmp slt i32 %0, 400000000, !dbg !14
+  br i1 %cmp, label %while.body, label %while.end, !dbg !14
+
+while.body:                                       ; preds = %while.cond
+  %1 = load i32, i32* %i, align 4, !dbg !16
+  %cmp1 = icmp ne i32 %1, 100, !dbg !16
+  br i1 %cmp1, label %if.then, label %if.else, !dbg !16
+
+; Check _Z3sumii is inlined at this callsite.
+; CHECK: if.then:
+; CHECK-NOT: call i32 @_Z3sumii
+; CHECK: if.else:
+if.then:                                          ; preds = %while.body
+  %2 = load i32, i32* %i, align 4, !dbg !18
+  %3 = load i32, i32* %s, align 4, !dbg !18
+  %call = call i32 @_Z3sumii(i32 %2, i32 %3), !dbg !18
+  store i32 %call, i32* %s, align 4, !dbg !18
+  br label %if.end, !dbg !18
+
+if.else:                                          ; preds = %while.body
+  store i32 30, i32* %s, align 4, !dbg !20
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  br label %while.cond, !dbg !22
+
+while.end:                                        ; preds = %while.cond
+  %4 = load i32, i32* %s, align 4, !dbg !24
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %4), !dbg !24
+  ret i32 0, !dbg !25
+}
+
+declare i32 @printf(i8*, ...) #2
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+; CHECK: ![[UNKNOWN_ID]] = !{!"function_entry_count", i64 -1}
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: NoDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "calls.cc", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "sum", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "calls.cc", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "main", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 7, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 3}
+!10 = !{!"clang version 3.5 "}
+!11 = !DILocation(line: 4, scope: !4)
+!12 = !DILocation(line: 8, scope: !7)
+!13 = !DILocation(line: 9, scope: !7)
+!14 = !DILocation(line: 9, scope: !15)
+!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7)
+!16 = !DILocation(line: 10, scope: !17)
+!17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7)
+!18 = !DILocation(line: 10, scope: !19)
+!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17)
+!20 = !DILocation(line: 10, scope: !21)
+!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17)
+!22 = !DILocation(line: 10, scope: !23)
+!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17)
+!24 = !DILocation(line: 11, scope: !7)
+!25 = !DILocation(line: 12, scope: !7)




More information about the llvm-commits mailing list