[llvm] [PGO] Sampled instrumentation in PGO to speed up instrumentation binary (PR #69535)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 18 10:06:09 PDT 2024


https://github.com/xur-llvm updated https://github.com/llvm/llvm-project/pull/69535

>From 451d818b0d93e5f624ac6e75120bb3ead87eedc4 Mon Sep 17 00:00:00 2001
From: Rong Xu <xur at google.com>
Date: Wed, 18 Oct 2023 15:13:15 -0700
Subject: [PATCH 1/5] Sampled instrumentation in PGO to speed up
 instrumentation binary

A PGO-instrumented binary can be very slow compared to the
non-instrumented binary. It's not uncommon to see a 10x slowdown
for highly threaded programs, due to data races or false sharing
on the counters.

This patch uses sampling in PGO instrumentation to speed up the
instrumented binary. The basic idea is the same as the one
here: https://reviews.llvm.org/D63949

This patch makes some improvements so that we only use one
condition. We now fix the whole duration (the sampling period) at
65536 and use the wraparound of unsigned short.

With this sampled instrumentation, the binary runs much
faster. We measure a 5x speedup using the default duration.
We now only see about a 20% to 30% slowdown (compared to an
8 to 10x slowdown without sampling).

The profile quality is pretty good with sampling: the edge
counts usually report >90% overlap.

For apps whose program behavior changes with binary speed,
sampled instrumentation can improve the performance.
We have observed some apps getting up to ~2% improvement in PGO.

One potential issue of this patch is the increased binary
size and compilation time.
---
 .../llvm/ProfileData/InstrProfData.inc        |   1 +
 .../include/llvm/Transforms/Instrumentation.h |   6 +
 .../Instrumentation/PGOInstrumentation.h      |   6 +-
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  10 +-
 .../Instrumentation/InstrProfiling.cpp        | 151 +++++++++++++++---
 .../Instrumentation/PGOInstrumentation.cpp    |   2 +
 .../PGOProfile/Inputs/cspgo_bar_sample.ll     |  82 ++++++++++
 .../PGOProfile/counter_promo_sampling.ll      |  78 +++++++++
 .../Transforms/PGOProfile/cspgo_sample.ll     | 112 +++++++++++++
 .../Transforms/PGOProfile/instrprof_sample.ll |  47 ++++++
 10 files changed, 470 insertions(+), 25 deletions(-)
 create mode 100644 llvm/test/Transforms/PGOProfile/Inputs/cspgo_bar_sample.ll
 create mode 100644 llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
 create mode 100644 llvm/test/Transforms/PGOProfile/cspgo_sample.ll
 create mode 100644 llvm/test/Transforms/PGOProfile/instrprof_sample.ll

diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index e9866d94b762c..847e53cfa7432 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -739,6 +739,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime
 #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias
 #define INSTR_PROF_PROFILE_SET_TIMESTAMP __llvm_profile_set_timestamp
+#define INSTR_PROF_PROFILE_SAMPLING_VAR __llvm_profile_sampling
 
 /* The variable that holds the name of the profile data
  * specified via command line. */
diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h
index ea97ab2562a5b..969c2cd12f3f0 100644
--- a/llvm/include/llvm/Transforms/Instrumentation.h
+++ b/llvm/include/llvm/Transforms/Instrumentation.h
@@ -121,12 +121,18 @@ struct InstrProfOptions {
   // Use BFI to guide register promotion
   bool UseBFIInPromotion = false;
 
+  // Use sampling to reduce the profile instrumentation runtime overhead.
+  bool Sampling = false;
+
   // Name of the profile file to use as output
   std::string InstrProfileOutput;
 
   InstrProfOptions() = default;
 };
 
+// Create the variable for profile sampling.
+void createProfileSamplingVar(Module &M);
+
 // Options for sanitizer coverage instrumentation.
 struct SanitizerCoverageOptions {
   enum Type {
diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
index 5b1977b7de9a2..7199f27dbc991 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
@@ -43,12 +43,14 @@ class FileSystem;
 class PGOInstrumentationGenCreateVar
     : public PassInfoMixin<PGOInstrumentationGenCreateVar> {
 public:
-  PGOInstrumentationGenCreateVar(std::string CSInstrName = "")
-      : CSInstrName(CSInstrName) {}
+  PGOInstrumentationGenCreateVar(std::string CSInstrName = "",
+                                 bool Sampling = false)
+      : CSInstrName(CSInstrName), ProfileSampling(Sampling) {}
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
 
 private:
   std::string CSInstrName;
+  bool ProfileSampling;
 };
 
 /// The instrumentation (profile-instr-gen) pass for IR based PGO.
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 4fd5ee1946bb7..4dc7cd7387657 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -296,6 +296,9 @@ static cl::opt<AttributorRunOption> AttributorRun(
                clEnumValN(AttributorRunOption::NONE, "none",
                           "disable attributor runs")));
 
+static cl::opt<bool> EnableSampledInstr(
+    "enable-sampled-instr", cl::init(false), cl::Hidden,
+    cl::desc("Enable profile instrumentation sampling (default = off)"));
 static cl::opt<bool> UseLoopVersioningLICM(
     "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
     cl::desc("Enable the experimental Loop Versioning LICM pass"));
@@ -847,6 +850,10 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
   // Do counter promotion at Level greater than O0.
   Options.DoCounterPromotion = true;
   Options.UseBFIInPromotion = IsCS;
+  if (EnableSampledInstr) {
+    Options.Sampling = true;
+    Options.DoCounterPromotion = false;
+  }
   Options.Atomic = AtomicCounterUpdate;
   MPM.addPass(InstrProfilingLoweringPass(Options, IsCS));
 }
@@ -1185,7 +1192,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
     MPM.addPass(PGOIndirectCallPromotion(false, false));
 
   if (IsPGOPreLink && PGOOpt->CSAction == PGOOptions::CSIRInstr)
-    MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile));
+    MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile,
+                                               EnableSampledInstr));
 
   if (IsMemprofUse)
     MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS));
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index f994f8a62c320..cbcc3939206ab 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -170,6 +170,17 @@ cl::opt<bool> SkipRetExitBlock(
     "skip-ret-exit-block", cl::init(true),
     cl::desc("Suppress counter promotion if exit blocks contain ret."));
 
+static cl::opt<bool>
+    SampledInstrument("sampled-instr", cl::ZeroOrMore, cl::init(false),
+                      cl::desc("Do PGO instrumentation sampling"));
+
+static cl::opt<unsigned> SampledInstrumentDuration(
+    "sampled-instr-duration",
+    cl::desc("Set the sample rate for profile instrumentation, with a value "
+             "range 0 to 65535. We will record this number of samples for "
+             "every 65536 count updates"),
+    cl::init(200));
+
 using LoadStorePair = std::pair<Instruction *, Instruction *>;
 
 static uint64_t getIntModuleFlagOrZero(const Module &M, StringRef Flag) {
@@ -260,6 +271,9 @@ class InstrLowerer final {
   /// Returns true if profile counter update register promotion is enabled.
   bool isCounterPromotionEnabled() const;
 
+  /// Return true if profile sampling is enabled.
+  bool isSamplingEnabled() const;
+
   /// Count the number of instrumented value sites for the function.
   void computeNumValueSiteCounts(InstrProfValueProfileInst *Ins);
 
@@ -291,6 +305,9 @@ class InstrLowerer final {
   /// acts on.
   Value *getCounterAddress(InstrProfCntrInstBase *I);
 
+  /// Lower the incremental instructions under profile sampling predicates.
+  void doSampling(Instruction *I);
+
   /// Get the region counters for an increment, creating them if necessary.
   ///
   /// If the counter array doesn't yet exist, the profile data variables
@@ -635,33 +652,95 @@ PreservedAnalyses InstrProfilingLoweringPass::run(Module &M,
   return PreservedAnalyses::none();
 }
 
+// Perform instrumentation sampling.
+// We transform:
+//   Increment_Instruction;
+// to:
+//   if (__llvm_profile_sampling__ <= SampleDuration) {
+//     Increment_Instruction;
+//   }
+//   __llvm_profile_sampling__ += 1;
+//
+// "__llvm_profile_sampling__" is a thread-local global shared by all PGO
+// instrumentation variables (value-instrumentation and edge instrumentation).
+// It has a unsigned short type and will wrapper around when overflow.
+//
+// Note that, the code snippet after the transformation can still be
+// counter promoted. But I don't see a reason for that because the
+// counter updated should be sparse. That's the reason we disable
+// counter promotion by default when sampling is enabled.
+// This can be overwritten by the internal option.
+//
+void InstrLowerer::doSampling(Instruction *I) {
+  if (!isSamplingEnabled())
+    return;
+  int SampleDuration = SampledInstrumentDuration.getValue();
+  unsigned WrapToZeroValue = USHRT_MAX + 1;
+  assert(SampleDuration < USHRT_MAX);
+  auto *Int16Ty = Type::getInt16Ty(M.getContext());
+  auto *CountVar =
+      M.getGlobalVariable(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_SAMPLING_VAR));
+  assert(CountVar && "CountVar not set properly");
+  IRBuilder<> CondBuilder(I);
+  auto *LoadCountVar = CondBuilder.CreateLoad(Int16Ty, CountVar);
+  auto *DurationCond = CondBuilder.CreateICmpULE(
+      LoadCountVar, CondBuilder.getInt16(SampleDuration));
+  MDBuilder MDB(I->getContext());
+  MDNode *BranchWeight =
+      MDB.createBranchWeights(SampleDuration, WrapToZeroValue - SampleDuration);
+  Instruction *ThenTerm = SplitBlockAndInsertIfThen(
+      DurationCond, I, /* Unreacheable */ false, BranchWeight);
+  IRBuilder<> IncBuilder(I);
+  auto *NewVal = IncBuilder.CreateAdd(LoadCountVar, IncBuilder.getInt16(1));
+  IncBuilder.CreateStore(NewVal, CountVar);
+  I->moveBefore(ThenTerm);
+}
+
 bool InstrLowerer::lowerIntrinsics(Function *F) {
   bool MadeChange = false;
   PromotionCandidates.clear();
+  SmallVector<InstrProfInstBase *, 8> InstrProfInsts;
+
   for (BasicBlock &BB : *F) {
     for (Instruction &Instr : llvm::make_early_inc_range(BB)) {
-      if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(&Instr)) {
-        lowerIncrement(IPIS);
-        MadeChange = true;
-      } else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(&Instr)) {
-        lowerIncrement(IPI);
-        MadeChange = true;
-      } else if (auto *IPC = dyn_cast<InstrProfTimestampInst>(&Instr)) {
-        lowerTimestamp(IPC);
-        MadeChange = true;
-      } else if (auto *IPC = dyn_cast<InstrProfCoverInst>(&Instr)) {
-        lowerCover(IPC);
-        MadeChange = true;
-      } else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(&Instr)) {
-        lowerValueProfileInst(IPVP);
-        MadeChange = true;
-      } else if (auto *IPMP = dyn_cast<InstrProfMCDCBitmapParameters>(&Instr)) {
-        IPMP->eraseFromParent();
-        MadeChange = true;
-      } else if (auto *IPBU = dyn_cast<InstrProfMCDCTVBitmapUpdate>(&Instr)) {
-        lowerMCDCTestVectorBitmapUpdate(IPBU);
-        MadeChange = true;
-      }
+      if (auto *IP = dyn_cast<InstrProfInstBase>(&Instr))
+        InstrProfInsts.push_back(IP);
+    }
+  }
+
+  for (auto *Instr : InstrProfInsts) {
+    if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(Instr)) {
+      doSampling(IPIS);
+      lowerIncrement(IPIS);
+      MadeChange = true;
+    } else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(Instr)) {
+      doSampling(IPI);
+      lowerIncrement(IPI);
+      MadeChange = true;
+    } else if (auto *IPC = dyn_cast<InstrProfTimestampInst>(Instr)) {
+      doSampling(IPC);
+      lowerTimestamp(IPC);
+      MadeChange = true;
+    } else if (auto *IPC = dyn_cast<InstrProfCoverInst>(Instr)) {
+      doSampling(IPC);
+      lowerCover(IPC);
+      MadeChange = true;
+    } else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(Instr)) {
+      doSampling(IPVP);
+      lowerValueProfileInst(IPVP);
+      MadeChange = true;
+    } else if (auto *IPMP = dyn_cast<InstrProfMCDCBitmapParameters>(Instr)) {
+      doSampling(IPMP);
+      IPMP->eraseFromParent();
+      MadeChange = true;
+    } else if (auto *IPBU = dyn_cast<InstrProfMCDCTVBitmapUpdate>(Instr)) {
+      doSampling(IPBU);
+      lowerMCDCTestVectorBitmapUpdate(IPBU);
+      MadeChange = true;
+    } else {
+      LLVM_DEBUG(dbgs() << "Invalid InstroProf intrinsic: " << *Instr << "\n");
+      // ?? Seeing "call void @llvm.memcpy.p0.p0.i64..." here ??
+      // llvm_unreachable("Invalid InstroProf intrinsic");
     }
   }
 
@@ -684,6 +763,12 @@ bool InstrLowerer::isRuntimeCounterRelocationEnabled() const {
   return TT.isOSFuchsia();
 }
 
+bool InstrLowerer::isSamplingEnabled() const {
+  if (SampledInstrument.getNumOccurrences() > 0)
+    return SampledInstrument;
+  return Options.Sampling;
+}
+
 bool InstrLowerer::isCounterPromotionEnabled() const {
   if (DoCounterPromotion.getNumOccurrences() > 0)
     return DoCounterPromotion;
@@ -754,6 +839,9 @@ bool InstrLowerer::lower() {
   if (NeedsRuntimeHook)
     MadeChange = emitRuntimeHook();
 
+  if (!IsCS && isSamplingEnabled())
+    createProfileSamplingVar(M);
+
   bool ContainsProfiling = containsProfilingIntrinsics(M);
   GlobalVariable *CoverageNamesVar =
       M.getNamedGlobal(getCoverageUnusedNamesVarName());
@@ -1952,3 +2040,22 @@ void InstrLowerer::emitInitialization() {
 
   appendToGlobalCtors(M, F, 0);
 }
+
+namespace llvm {
+// Create the variable for profile sampling.
+void createProfileSamplingVar(Module &M) {
+  const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_SAMPLING_VAR));
+  Type *IntTy16 = Type::getInt16Ty(M.getContext());
+  auto SamplingVar = new GlobalVariable(
+      M, IntTy16, false, GlobalValue::WeakAnyLinkage,
+      Constant::getIntegerValue(IntTy16, APInt(16, 0)), VarName);
+  SamplingVar->setVisibility(GlobalValue::DefaultVisibility);
+  SamplingVar->setThreadLocal(true);
+  Triple TT(M.getTargetTriple());
+  if (TT.supportsCOMDAT()) {
+    SamplingVar->setLinkage(GlobalValue::ExternalLinkage);
+    SamplingVar->setComdat(M.getOrInsertComdat(VarName));
+  }
+  appendToCompilerUsed(M, SamplingVar);
+}
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 35b1bbf21be97..4f0ccc8f962db 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1875,6 +1875,8 @@ PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &MAM) {
   // The variable in a comdat may be discarded by LTO. Ensure the declaration
   // will be retained.
   appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true));
+  if (ProfileSampling)
+    createProfileSamplingVar(M);
   PreservedAnalyses PA;
   PA.preserve<FunctionAnalysisManagerModuleProxy>();
   PA.preserveSet<AllAnalysesOn<Function>>();
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/cspgo_bar_sample.ll b/llvm/test/Transforms/PGOProfile/Inputs/cspgo_bar_sample.ll
new file mode 100644
index 0000000000000..1c8be82715f25
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/Inputs/cspgo_bar_sample.ll
@@ -0,0 +1,82 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_filename = comdat any
+$__llvm_profile_raw_version = comdat any
+$__llvm_profile_sampling = comdat any
+
+ at odd = common dso_local local_unnamed_addr global i32 0, align 4
+ at even = common dso_local local_unnamed_addr global i32 0, align 4
+ at __llvm_profile_filename = local_unnamed_addr constant [25 x i8] c"pass2/default_%m.profraw\00", comdat
+ at __llvm_profile_raw_version = local_unnamed_addr constant i64 216172782113783812, comdat
+ at __llvm_profile_sampling = thread_local global i16 0, comdat
+ at llvm.used = appending global [1 x i8*] [i8* bitcast (i64* @__llvm_profile_sampling to i8*)], section "llvm.metadata"
+
+define dso_local void @bar(i32 %n) !prof !30 {
+entry:
+  %call = tail call fastcc i32 @cond(i32 %n)
+  %tobool = icmp eq i32 %call, 0
+  br i1 %tobool, label %if.else, label %if.then, !prof !31
+
+if.then:
+  %0 = load i32, i32* @odd, align 4, !tbaa !32
+  %inc = add i32 %0, 1
+  store i32 %inc, i32* @odd, align 4, !tbaa !32
+  br label %if.end
+
+if.else:
+  %1 = load i32, i32* @even, align 4, !tbaa !32
+  %inc1 = add i32 %1, 1
+  store i32 %inc1, i32* @even, align 4, !tbaa !32
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define internal fastcc i32 @cond(i32 %i) #1 !prof !30 !PGOFuncName !36 {
+entry:
+  %rem = srem i32 %i, 2
+  ret i32 %rem
+}
+
+attributes #1 = { inlinehint noinline }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"EnableSplitLTOUnit", i32 0}
+!2 = !{i32 1, !"ProfileSummary", !3}
+!3 = !{!4, !5, !6, !7, !8, !9, !10, !11}
+!4 = !{!"ProfileFormat", !"InstrProf"}
+!5 = !{!"TotalCount", i64 500002}
+!6 = !{!"MaxCount", i64 200000}
+!7 = !{!"MaxInternalCount", i64 100000}
+!8 = !{!"MaxFunctionCount", i64 200000}
+!9 = !{!"NumCounts", i64 6}
+!10 = !{!"NumFunctions", i64 4}
+!11 = !{!"DetailedSummary", !12}
+!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
+!13 = !{i32 10000, i64 200000, i32 1}
+!14 = !{i32 100000, i64 200000, i32 1}
+!15 = !{i32 200000, i64 200000, i32 1}
+!16 = !{i32 300000, i64 200000, i32 1}
+!17 = !{i32 400000, i64 200000, i32 1}
+!18 = !{i32 500000, i64 100000, i32 4}
+!19 = !{i32 600000, i64 100000, i32 4}
+!20 = !{i32 700000, i64 100000, i32 4}
+!21 = !{i32 800000, i64 100000, i32 4}
+!22 = !{i32 900000, i64 100000, i32 4}
+!23 = !{i32 950000, i64 100000, i32 4}
+!24 = !{i32 990000, i64 100000, i32 4}
+!25 = !{i32 999000, i64 100000, i32 4}
+!26 = !{i32 999900, i64 100000, i32 4}
+!27 = !{i32 999990, i64 100000, i32 4}
+!28 = !{i32 999999, i64 1, i32 6}
+!30 = !{!"function_entry_count", i64 200000}
+!31 = !{!"branch_weights", i32 100000, i32 100000}
+!32 = !{!33, !33, i64 0}
+!33 = !{!"int", !34, i64 0}
+!34 = !{!"omnipotent char", !35, i64 0}
+!35 = !{!"Simple C/C++ TBAA"}
+!36 = !{!"cspgo_bar.c:cond"}
diff --git a/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll b/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
new file mode 100644
index 0000000000000..6f13196a72499
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
@@ -0,0 +1,78 @@
+; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -sampled-instr=true -skip-ret-exit-block=0 -S | FileCheck --check-prefixes=SAMPLING,PROMO %s
+
+; SAMPLING: $__llvm_profile_sampling = comdat any
+; SAMPLING: @__llvm_profile_sampling = thread_local global i16 0, comdat
+
+define void @foo(i32 %n, i32 %N) {
+; SAMPLING-LABEL: @foo
+; SAMPLING:  %[[VV0:[0-9]+]] = load i16, ptr @__llvm_profile_sampling, align 2
+; SAMPLING:  %[[VV1:[0-9]+]] = icmp ule i16 %[[VV0]], 200
+; SAMPLING:  br i1 %[[VV1]], label {{.*}}, label {{.*}}, !prof !0
+; SAMPLING: {{.*}} = load {{.*}} @__profc_foo{{.*}} 3)
+; SAMPLING-NEXT: add
+; SAMPLING-NEXT: store {{.*}}@__profc_foo{{.*}}3)
+bb:
+  %tmp = add nsw i32 %n, 1
+  %tmp1 = add nsw i32 %n, -1
+  br label %bb2
+
+bb2:
+; PROMO: phi {{.*}}
+; PROMO-NEXT: phi {{.*}}
+; PROMO-NEXT: phi {{.*}}
+; PROMO-NEXT: phi {{.*}}
+  %i.0 = phi i32 [ 0, %bb ], [ %tmp10, %bb9 ]
+  %tmp3 = icmp slt i32 %i.0, %tmp
+  br i1 %tmp3, label %bb4, label %bb5
+
+bb4:
+  tail call void @bar(i32 1)
+  br label %bb9
+
+bb5:
+  %tmp6 = icmp slt i32 %i.0, %tmp1
+  br i1 %tmp6, label %bb7, label %bb8
+
+bb7:
+  tail call void @bar(i32 2)
+  br label %bb9
+
+bb8:
+  tail call void @bar(i32 3)
+  br label %bb9
+
+bb9:
+; SAMPLING:       phi {{.*}}
+; SAMPLING-NEXT:  %[[V1:[0-9]+]] = add i16 {{.*}}, 1
+; SAMPLING-NEXT:  store i16 %[[V1]], ptr @__llvm_profile_sampling, align 2
+; SAMPLING:       phi {{.*}}
+; SAMPLING-NEXT:  %[[V2:[0-9]+]] = add i16 {{.*}}, 1
+; SAMPLING-NEXT:  store i16 %[[V2]], ptr @__llvm_profile_sampling, align 2
+; SAMPLING:       phi {{.*}}
+; SAMPLING-NEXT:  %[[V3:[0-9]+]] = add i16 {{.*}}, 1
+; SAMPLING-NEXT:  store i16 %[[V3]], ptr @__llvm_profile_sampling, align 2
+; PROMO: %[[LIVEOUT3:[a-z0-9]+]] = phi {{.*}}
+; PROMO-NEXT: %[[LIVEOUT2:[a-z0-9]+]] = phi {{.*}}
+; PROMO-NEXT: %[[LIVEOUT1:[a-z0-9]+]] = phi {{.*}}
+  %tmp10 = add nsw i32 %i.0, 1
+  %tmp11 = icmp slt i32 %tmp10, %N
+  br i1 %tmp11, label %bb2, label %bb12
+
+bb12:
+  ret void
+; PROMO: %[[CHECK1:[a-z0-9.]+]] = load {{.*}} @__profc_foo{{.*}}
+; PROMO-NEXT: add {{.*}} %[[CHECK1]], %[[LIVEOUT1]]
+; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}
+; PROMO-NEXT: %[[CHECK2:[a-z0-9.]+]] = load {{.*}} @__profc_foo{{.*}} 1)
+; PROMO-NEXT: add {{.*}} %[[CHECK2]], %[[LIVEOUT2]]
+; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}1)
+; PROMO-NEXT: %[[CHECK3:[a-z0-9.]+]] = load {{.*}} @__profc_foo{{.*}} 2)
+; PROMO-NEXT: add {{.*}} %[[CHECK3]], %[[LIVEOUT3]]
+; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}2)
+; PROMO-NOT: @__profc_foo{{.*}})
+
+}
+
+declare void @bar(i32)
+
+; SAMPLING: !0 = !{!"branch_weights", i32 200, i32 65336}
diff --git a/llvm/test/Transforms/PGOProfile/cspgo_sample.ll b/llvm/test/Transforms/PGOProfile/cspgo_sample.ll
new file mode 100644
index 0000000000000..6683cae4e64c1
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/cspgo_sample.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; REQUIRES: x86-registered-target
+
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %S/Inputs/cspgo_bar_sample.ll -o %t2.bc
+; RUN: llvm-lto2 run -lto-cspgo-profile-file=alloc -enable-sampled-instr -lto-cspgo-gen -save-temps -o %t %t1.bc %t2.bc \
+; RUN:   -r=%t1.bc,foo,pl \
+; RUN:   -r=%t1.bc,bar,l \
+; RUN:   -r=%t1.bc,main,plx \
+; RUN:   -r=%t1.bc,__llvm_profile_filename,plx \
+; RUN:   -r=%t1.bc,__llvm_profile_raw_version,plx \
+; RUN:   -r=%t1.bc,__llvm_profile_sampling,pl \
+; RUN:   -r=%t2.bc,bar,pl \
+; RUN:   -r=%t2.bc,odd,pl \
+; RUN:   -r=%t2.bc,even,pl \
+; RUN:   -r=%t2.bc,__llvm_profile_filename,x \
+; RUN:   -r=%t2.bc,__llvm_profile_raw_version,x \
+; RUN:   -r=%t2.bc,__llvm_profile_sampling,
+; RUN: llvm-dis %t.1.4.opt.bc -o - | FileCheck %s --check-prefix=CSGEN
+
+; CSGEN: @__llvm_profile_sampling = thread_local global i16 0, comdat
+; CSGEN: @__profc_
+; CSGEN: @__profd_
+
+source_filename = "cspgo.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_filename = comdat any
+$__llvm_profile_raw_version = comdat any
+$__llvm_profile_sampling = comdat any
+ at __llvm_profile_filename = local_unnamed_addr constant [25 x i8] c"pass2/default_%m.profraw\00", comdat
+ at __llvm_profile_raw_version = local_unnamed_addr constant i64 216172782113783812, comdat
+ at __llvm_profile_sampling = thread_local global i16 0, comdat
+ at llvm.used = appending global [1 x i8*] [i8* bitcast (i64* @__llvm_profile_sampling to i8*)], section "llvm.metadata"
+
+define dso_local void @foo() #0 !prof !30 {
+entry:
+  br label %for.body
+
+for.body:
+  %i.06 = phi i32 [ 0, %entry ], [ %add1, %for.body ]
+  tail call void @bar(i32 %i.06) #3
+  %add = or i32 %i.06, 1
+  tail call void @bar(i32 %add) #3
+  %add1 = add nuw nsw i32 %i.06, 2
+  %cmp = icmp ult i32 %add1, 200000
+  br i1 %cmp, label %for.body, label %for.end, !prof !31
+
+for.end:
+  ret void
+}
+
+; CSGEN-LABEL: @foo
+; CSGEN:        [[TMP0:%.*]]  = load i16, ptr @__llvm_profile_sampling, align 2
+; CSGEN-NEXT:   [[TMP1:%.*]] = icmp ult i16 [[TMP0]], 201
+; CSGEN-NEXT:   br i1 [[TMP1]], label %{{.*}}, label %{{.*}}, !prof [[PROF:![0-9]+]]
+; CSGEN:        [[TMP2:%.*]] = add i16 {{.*}}, 1
+; CSGEN-NEXT:   store i16 [[TMP2]], ptr @__llvm_profile_sampling, align 2
+
+declare dso_local void @bar(i32)
+
+define dso_local i32 @main() !prof !30 {
+entry:
+  tail call void @foo()
+  ret i32 0
+}
+; CSGEN-LABEL: @main
+; CSGEN:        [[TMP0:%.*]]  = load i16, ptr @__llvm_profile_sampling, align 2
+; CSGEN-NEXT:   [[TMP1:%.*]] = icmp ult i16 [[TMP0]], 201
+; CSGEN-NEXT:   br i1 [[TMP1]], label %{{.*}}, label %{{.*}}, !prof [[PROF:![0-9]+]]
+; CSGEN:        [[TMP2:%.*]] = add i16 {{.*}}, 1
+; CSGEN-NEXT:   store i16 [[TMP2]], ptr @__llvm_profile_sampling, align 2
+
+attributes #0 = { "target-cpu"="x86-64" }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"EnableSplitLTOUnit", i32 0}
+!2 = !{i32 1, !"ProfileSummary", !3}
+!3 = !{!4, !5, !6, !7, !8, !9, !10, !11}
+!4 = !{!"ProfileFormat", !"InstrProf"}
+!5 = !{!"TotalCount", i64 500002}
+!6 = !{!"MaxCount", i64 200000}
+!7 = !{!"MaxInternalCount", i64 100000}
+!8 = !{!"MaxFunctionCount", i64 200000}
+!9 = !{!"NumCounts", i64 6}
+!10 = !{!"NumFunctions", i64 4}
+!11 = !{!"DetailedSummary", !12}
+!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
+!13 = !{i32 10000, i64 200000, i32 1}
+!14 = !{i32 100000, i64 200000, i32 1}
+!15 = !{i32 200000, i64 200000, i32 1}
+!16 = !{i32 300000, i64 200000, i32 1}
+!17 = !{i32 400000, i64 200000, i32 1}
+!18 = !{i32 500000, i64 100000, i32 4}
+!19 = !{i32 600000, i64 100000, i32 4}
+!20 = !{i32 700000, i64 100000, i32 4}
+!21 = !{i32 800000, i64 100000, i32 4}
+!22 = !{i32 900000, i64 100000, i32 4}
+!23 = !{i32 950000, i64 100000, i32 4}
+!24 = !{i32 990000, i64 100000, i32 4}
+!25 = !{i32 999000, i64 100000, i32 4}
+!26 = !{i32 999900, i64 100000, i32 4}
+!27 = !{i32 999990, i64 100000, i32 4}
+!28 = !{i32 999999, i64 1, i32 6}
+!30 = !{!"function_entry_count", i64 1}
+!31 = !{!"branch_weights", i32 100000, i32 1}
+
+; CSGEN: [[PROF]] = !{!"branch_weights", i32 200, i32 65336}
+
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_sample.ll b/llvm/test/Transforms/PGOProfile/instrprof_sample.ll
new file mode 100644
index 0000000000000..481434cb34450
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/instrprof_sample.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -passes=instrprof -sampled-instr -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION
+; RRRRUN: opt < %s -passes=instrprof -sampled-instr -sampled-instr-duration=100 -S| FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION-100
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_raw_version = comdat any
+
+; SAMPLE-VAR: $__llvm_profile_sampling = comdat any
+
+ at __llvm_profile_raw_version = constant i64 72057594037927940, comdat
+ at __profn_f = private constant [1 x i8] c"f"
+
+; SAMPLE-VAR: @__llvm_profile_sampling = thread_local global i16 0, comdat
+; SAMPLE-VAR: @__profc_f = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
+; SAMPLE-VAR: @__profd_f = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 -3706093650706652785, i64 12884901887, i64 sub (i64 ptrtoint (ptr @__profc_f to i64), i64 ptrtoint (ptr @__profd_f to i64)), i64 0, ptr @f.local, ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc_f), align 8
+; SAMPLE-VAR: @__llvm_prf_nm = private constant [11 x i8] c"\01\09x\DAK\03\00\00g\00g", section "__llvm_prf_names", align 1
+; SAMPLE-VAR: @llvm.compiler.used = appending global [2 x ptr] [ptr @__llvm_profile_sampling, ptr @__profd_f], section "llvm.metadata"
+; SAMPLE-VAR: @llvm.used = appending global [1 x ptr] [ptr @__llvm_prf_nm], section "llvm.metadata"
+
+
+define void @f() {
+; SAMPLE-CODE-LABEL: @f(
+; SAMPLE-CODE:  entry:
+; SAMPLE-CODE-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
+; SAMPLE-DURATION:         [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 200
+; SAMPLE-DURATION-100:     [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 100
+; SAMPLE-CODE:         br i1 [[TMP1]], label %[[TMP2:.*]], label %[[TMP4:.*]], !prof !0
+; SAMPLE-CODE:       [[TMP2]]:
+; SAMPLE-CODE-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f
+; SAMPLE-CODE-NEXT:    [[TMP3:%.*]] = add i64 [[PGOCOUNT]], 1
+; SAMPLE-CODE-NEXT:    store i64 [[TMP3]], ptr @__profc_f
+; SAMPLE-CODE-NEXT:    br label %[[TMP4]]
+; SAMPLE-CODE:       [[TMP4]]:
+; SAMPLE-CODE-NEXT:    [[TMP5:%.*]] = add i16 [[TMP0]], 1
+; SAMPLE-CODE-NEXT:    store i16 [[TMP5]],  ptr @__llvm_profile_sampling, align 2
+; SAMPLE-CODE-NEXT:    ret void
+;
+entry:
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
+  ret void
+}
+
+; SAMPLE-DURATION: !0 = !{!"branch_weights", i32 200, i32 65336}
+; SAMPLE-DURATION-100: !0 = !{!"branch_weights", i32 100, i32 65436}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)

>From 64cbd8f09d7a5b8e619c94871c11dba41cdfca1f Mon Sep 17 00:00:00 2001
From: Rong Xu <xur at google.com>
Date: Sun, 14 Jul 2024 17:04:27 -0700
Subject: [PATCH 2/5] [PGO] Sampled instrumentation in PGO to speed up
 instrumentation binary

Integrated the reviews from Wenlei, David, Snehasish and Hongtao.
The patch now has 3 modes for sampling:
(1) full burst sampling
(2) fast burst sampling
(3) simple sampling
Also update the tests.
---
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   2 +
 .../Instrumentation/InstrProfiling.cpp        | 189 +++++++++++++-----
 .../PGOProfile/counter_promo_sampling.ll      |   2 +-
 ...ple.ll => instrprof_bust_sampling_fast.ll} |  10 +-
 .../instrprof_bust_sampling_full.ll           |  45 +++++
 .../instrprof_bust_sampling_full_intsize.ll   |  45 +++++
 .../PGOProfile/instrprof_simple_sampling.ll   |  39 ++++
 7 files changed, 274 insertions(+), 58 deletions(-)
 rename llvm/test/Transforms/PGOProfile/{instrprof_sample.ll => instrprof_bust_sampling_fast.ll} (81%)
 create mode 100644 llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full.ll
 create mode 100644 llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full_intsize.ll
 create mode 100644 llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 4dc7cd7387657..ed7a9d09ee5cc 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -852,6 +852,8 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
   Options.UseBFIInPromotion = IsCS;
   if (EnableSampledInstr) {
     Options.Sampling = true;
+    // With sampling, there is little beneifit to enable counter promotion.
+    // But note that sampling does work with counter promotion.
     Options.DoCounterPromotion = false;
   }
   Options.Atomic = AtomicCounterUpdate;
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index cbcc3939206ab..994b62e39d3c9 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -170,15 +170,27 @@ cl::opt<bool> SkipRetExitBlock(
     "skip-ret-exit-block", cl::init(true),
     cl::desc("Suppress counter promotion if exit blocks contain ret."));
 
-static cl::opt<bool>
-    SampledInstrument("sampled-instr", cl::ZeroOrMore, cl::init(false),
-                      cl::desc("Do PGO instrumentation sampling"));
-
-static cl::opt<unsigned> SampledInstrumentDuration(
-    "sampled-instr-duration",
-    cl::desc("Set the sample rate for profile instrumentation, with a value "
-             "range 0 to 65535. We will record this number of samples for "
-             "every 65536 count updates"),
+static cl::opt<bool> SampledInstr("sampled-instr", cl::ZeroOrMore,
+                                  cl::init(false),
+                                  cl::desc("Do PGO instrumentation sampling"));
+
+static cl::opt<unsigned> SampledInstrPeriod(
+    "sampled-instr-period",
+    cl::desc("Set the profile instrumentation sample period. For each sample "
+             "period, the 'sampled-instr-burst-duration' number of consecutive "
+             "samples will be recorded. The default sample period of 65535 is "
+             "optimized for generating efficient code that leverages unsigned "
+             "integer wrapping in overflow."),
+    cl::init(65535));
+
+static cl::opt<unsigned> SampledInstrBurstDuration(
+    "sampled-instr-burst-duration",
+    cl::desc("Set the profile instrumentation burst duration, which can range "
+             "from 0 to one less than the value of 'sampled-instr-period'. "
+             "This number of samples will be recorded for each "
+             "'sampled-instr-period' count update. Setting to 1 enables "
+             "simple sampling, in which case it is recommended to set "
+             "'sampled-instr-period' to a prime number."),
     cl::init(200));
 
 using LoadStorePair = std::pair<Instruction *, Instruction *>;
@@ -652,48 +664,124 @@ PreservedAnalyses InstrProfilingLoweringPass::run(Module &M,
   return PreservedAnalyses::none();
 }
 
+//
 // Perform instrumentation sampling.
-// We transform:
+//
+// There are 3 flavors of sampling:
+// (1) Full burst sampling: We transform:
 //   Increment_Instruction;
 // to:
-//   if (__llvm_profile_sampling__ <= SampleDuration) {
+//   if (__llvm_profile_sampling__ < SampledInstrBurstDuration) {
 //     Increment_Instruction;
 //   }
 //   __llvm_profile_sampling__ += 1;
+//   if (__llvm_profile_sampling__ >= SampledInstrPeriod) {
+//     __llvm_profile_sampling__ = 0;
+//   }
 //
 // "__llvm_profile_sampling__" is a thread-local global shared by all PGO
-// instrumentation variables (value-instrumentation and edge instrumentation).
-// It has a unsigned short type and will wrapper around when overflow.
+// counters (value-instrumentation and edge instrumentation).
+//
+// (2) Fast burst sampling:
+// The value is an unsigned type, meaning it will wrap around to zero when
+// overflows. In this case, a second check (check2) is unnecessary, so we
+// won't generate check2 when the SampledInstrPeriod is set to 65535 (64K - 1).
+// The code after:
+//   if (__llvm_profile_sampling__ < SampledInstrBurstDuration) {
+//     Increment_Instruction;
+//   }
+//   __llvm_profile_sampling__ += 1;
 //
-// Note that, the code snippet after the transformation can still be
-// counter promoted. But I don't see a reason for that because the
-// counter updated should be sparse. That's the reason we disable
-// counter promotion by default when sampling is enabled.
-// This can be overwritten by the internal option.
+// (3) Simple sampling:
+// When SampledInstrBurstDuration sets to 1, we do a simple sampling:
+//   __llvm_profile_sampling__ += 1;
+//   if (__llvm_profile_sampling__ >= SampledInstrPeriod) {
+//     __llvm_profile_sampling__ = 0;
+//     Increment_Instruction;
+//   }
 //
+// Note that, the code snippet after the transformation can still be counter
+// promoted. However, with sampling enabled, counter updates are expected to
+// be infrequent, making the benefits of counter promotion negligible.
+// Moreover, counter promotion can potentially cause issues in server
+// applications, particularly when the counters are dumped without a clean
+// exit. To mitigate this risk, counter promotion is disabled by default when
+// sampling is enabled. This behavior can be overridden using the internal
+// option.
 void InstrLowerer::doSampling(Instruction *I) {
   if (!isSamplingEnabled())
     return;
-  int SampleDuration = SampledInstrumentDuration.getValue();
-  unsigned WrapToZeroValue = USHRT_MAX + 1;
-  assert(SampleDuration < USHRT_MAX);
-  auto *Int16Ty = Type::getInt16Ty(M.getContext());
-  auto *CountVar =
+
+  unsigned SampledBurstDuration = SampledInstrBurstDuration.getValue();
+  unsigned SampledPeriod = SampledInstrPeriod.getValue();
+  assert(SampledBurstDuration < SampledPeriod);
+  bool UseShort = (SampledPeriod <= USHRT_MAX);
+  bool IsSimpleSampling = (SampledBurstDuration == 1);
+  bool IsFastSampling = (!IsSimpleSampling && SampledPeriod == 65535);
+
+  auto GetConstant = [UseShort](IRBuilder<> &Builder, uint32_t C) {
+    if (UseShort)
+      return Builder.getInt16(C);
+    else
+      return Builder.getInt32(C);
+  };
+
+  IntegerType *SamplingVarTy;
+  if (UseShort)
+    SamplingVarTy = Type::getInt16Ty(M.getContext());
+  else
+    SamplingVarTy = Type::getInt32Ty(M.getContext());
+  auto *SamplingVar =
       M.getGlobalVariable(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_SAMPLING_VAR));
-  assert(CountVar && "CountVar not set properly");
-  IRBuilder<> CondBuilder(I);
-  auto *LoadCountVar = CondBuilder.CreateLoad(Int16Ty, CountVar);
-  auto *DurationCond = CondBuilder.CreateICmpULE(
-      LoadCountVar, CondBuilder.getInt16(SampleDuration));
+  assert(SamplingVar && "SamplingVar not set properly");
+
+  // Create the condition for checking the burst duration.
+  Instruction *SamplingVarIncr;
+  Value *NewSamplingVarVal;
   MDBuilder MDB(I->getContext());
-  MDNode *BranchWeight =
-      MDB.createBranchWeights(SampleDuration, WrapToZeroValue - SampleDuration);
-  Instruction *ThenTerm = SplitBlockAndInsertIfThen(
-      DurationCond, I, /* Unreacheable */ false, BranchWeight);
-  IRBuilder<> IncBuilder(I);
-  auto *NewVal = IncBuilder.CreateAdd(LoadCountVar, IncBuilder.getInt16(1));
-  IncBuilder.CreateStore(NewVal, CountVar);
-  I->moveBefore(ThenTerm);
+  MDNode *BranchWeight;
+  IRBuilder<> CondBuilder(I);
+  auto *LoadSamplingVar = CondBuilder.CreateLoad(SamplingVarTy, SamplingVar);
+  if (IsSimpleSampling) {
+    // For the simple sampling, just create the load and increments.
+    IRBuilder<> IncBuilder(I);
+    NewSamplingVarVal =
+        IncBuilder.CreateAdd(LoadSamplingVar, GetConstant(IncBuilder, 1));
+    SamplingVarIncr = IncBuilder.CreateStore(NewSamplingVarVal, SamplingVar);
+  } else {
+    // For the burst-sampling, create the conditional update.
+    auto *DurationCond = CondBuilder.CreateICmpULE(
+        LoadSamplingVar, GetConstant(CondBuilder, SampledBurstDuration));
+    BranchWeight = MDB.createBranchWeights(
+        SampledBurstDuration, SampledPeriod + 1 - SampledBurstDuration);
+    Instruction *ThenTerm = SplitBlockAndInsertIfThen(
+        DurationCond, I, /* Unreachable */ false, BranchWeight);
+    IRBuilder<> IncBuilder(I);
+    NewSamplingVarVal =
+        IncBuilder.CreateAdd(LoadSamplingVar, GetConstant(IncBuilder, 1));
+    SamplingVarIncr = IncBuilder.CreateStore(NewSamplingVarVal, SamplingVar);
+    I->moveBefore(ThenTerm);
+  }
+
+  if (IsFastSampling)
+    return;
+
+  // Create the condition for checking the period.
+  Instruction *ThenTerm, *ElseTerm;
+  IRBuilder<> PeriodCondBuilder(SamplingVarIncr);
+  auto *PeriodCond = PeriodCondBuilder.CreateICmpUGE(
+      NewSamplingVarVal, GetConstant(PeriodCondBuilder, SampledPeriod));
+  BranchWeight = MDB.createBranchWeights(1, SampledPeriod);
+  SplitBlockAndInsertIfThenElse(PeriodCond, SamplingVarIncr, &ThenTerm,
+                                &ElseTerm, BranchWeight);
+
+  // For the simple sampling, the counter update happens in sampling var reset.
+  if (IsSimpleSampling)
+    I->moveBefore(ThenTerm);
+
+  IRBuilder<> ResetBuilder(ThenTerm);
+  ResetBuilder.CreateStore(GetConstant(ResetBuilder, 0), SamplingVar);
+  SamplingVarIncr->moveBefore(ElseTerm);
 }
 
 bool InstrLowerer::lowerIntrinsics(Function *F) {
@@ -709,38 +797,28 @@ bool InstrLowerer::lowerIntrinsics(Function *F) {
   }
 
   for (auto *Instr : InstrProfInsts) {
+    doSampling(Instr);
     if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(Instr)) {
-      doSampling(IPIS);
       lowerIncrement(IPIS);
       MadeChange = true;
     } else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(Instr)) {
-      doSampling(IPI);
       lowerIncrement(IPI);
       MadeChange = true;
     } else if (auto *IPC = dyn_cast<InstrProfTimestampInst>(Instr)) {
-      doSampling(IPC);
       lowerTimestamp(IPC);
       MadeChange = true;
     } else if (auto *IPC = dyn_cast<InstrProfCoverInst>(Instr)) {
-      doSampling(IPC);
       lowerCover(IPC);
       MadeChange = true;
     } else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(Instr)) {
-      doSampling(IPVP);
       lowerValueProfileInst(IPVP);
       MadeChange = true;
     } else if (auto *IPMP = dyn_cast<InstrProfMCDCBitmapParameters>(Instr)) {
-      doSampling(IPMP);
       IPMP->eraseFromParent();
       MadeChange = true;
     } else if (auto *IPBU = dyn_cast<InstrProfMCDCTVBitmapUpdate>(Instr)) {
-      doSampling(IPBU);
       lowerMCDCTestVectorBitmapUpdate(IPBU);
       MadeChange = true;
-    } else {
-      LLVM_DEBUG(dbgs() << "Invalid InstroProf intrinsic: " << *Instr << "\n");
-      // ?? Seeing "call void @llvm.memcpy.p0.p0.i64..." here ??
-      // llvm_unreachable("Invalid InstroProf intrinsic");
     }
   }
 
@@ -764,8 +842,8 @@ bool InstrLowerer::isRuntimeCounterRelocationEnabled() const {
 }
 
 bool InstrLowerer::isSamplingEnabled() const {
-  if (SampledInstrument.getNumOccurrences() > 0)
-    return SampledInstrument;
+  if (SampledInstr.getNumOccurrences() > 0)
+    return SampledInstr;
   return Options.Sampling;
 }
 
@@ -2045,10 +2123,17 @@ namespace llvm {
 // Create the variable for profile sampling.
 void createProfileSamplingVar(Module &M) {
   const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_SAMPLING_VAR));
-  Type *IntTy16 = Type::getInt16Ty(M.getContext());
+  IntegerType *SamplingVarTy;
+  Constant *ValueZero;
+  if (SampledInstrPeriod.getValue() <= USHRT_MAX) {
+    SamplingVarTy = Type::getInt16Ty(M.getContext());
+    ValueZero = Constant::getIntegerValue(SamplingVarTy, APInt(16, 0));
+  } else {
+    SamplingVarTy = Type::getInt32Ty(M.getContext());
+    ValueZero = Constant::getIntegerValue(SamplingVarTy, APInt(32, 0));
+  }
   auto SamplingVar = new GlobalVariable(
-      M, IntTy16, false, GlobalValue::WeakAnyLinkage,
-      Constant::getIntegerValue(IntTy16, APInt(16, 0)), VarName);
+      M, SamplingVarTy, false, GlobalValue::WeakAnyLinkage, ValueZero, VarName);
   SamplingVar->setVisibility(GlobalValue::DefaultVisibility);
   SamplingVar->setThreadLocal(true);
   Triple TT(M.getTargetTriple());
diff --git a/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll b/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
index 6f13196a72499..eeef5b8741501 100644
--- a/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
+++ b/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -sampled-instr=true -skip-ret-exit-block=0 -S | FileCheck --check-prefixes=SAMPLING,PROMO %s
+; RUN: opt < %s --passes=pgo-instr-gen,instrprof --do-counter-promotion=true --sampled-instr=true --skip-ret-exit-block=0 -S | FileCheck --check-prefixes=SAMPLING,PROMO %s
 
 ; SAMPLING: $__llvm_profile_sampling = comdat any
 ; SAMPLING: @__llvm_profile_sampling = thread_local global i16 0, comdat
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_sample.ll b/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_fast.ll
similarity index 81%
rename from llvm/test/Transforms/PGOProfile/instrprof_sample.ll
rename to llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_fast.ll
index 481434cb34450..dfa7cd6d3608d 100644
--- a/llvm/test/Transforms/PGOProfile/instrprof_sample.ll
+++ b/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_fast.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -passes=instrprof -sampled-instr -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION
-; RRRRUN: opt < %s -passes=instrprof -sampled-instr -sampled-instr-duration=100 -S| FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION-100
+; RUN: opt < %s --passes=instrprof --sampled-instr -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION,SAMPLE-WEIGHT
+; RUN: opt < %s --passes=instrprof --sampled-instr --sampled-instr-burst-duration=100 -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION100,SAMPLE-WEIGHT100
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -24,7 +24,7 @@ define void @f() {
 ; SAMPLE-CODE:  entry:
 ; SAMPLE-CODE-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
 ; SAMPLE-DURATION:         [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 200
-; SAMPLE-DURATION-100:     [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 100
+; SAMPLE-DURATION100:     [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 100
 ; SAMPLE-CODE:         br i1 [[TMP1]], label %[[TMP2:.*]], label %[[TMP4:.*]], !prof !0
 ; SAMPLE-CODE:       [[TMP2]]:
 ; SAMPLE-CODE-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f
@@ -41,7 +41,7 @@ entry:
   ret void
 }
 
-; SAMPLE-DURATION: !0 = !{!"branch_weights", i32 200, i32 65336}
-; SAMPLE-DURATION-100: !0 = !{!"branch_weights", i32 100, i32 65436}
+; SAMPLE-WEIGHT: !0 = !{!"branch_weights", i32 200, i32 65336}
+; SAMPLE-WEIGHT100: !0 = !{!"branch_weights", i32 100, i32 65436}
 
 declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full.ll b/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full.ll
new file mode 100644
index 0000000000000..dd1837d5d0149
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s --passes=instrprof -sampled-instr --sampled-instr-period=1009 --sampled-instr-burst-duration=32 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_raw_version = comdat any
+
+ at __llvm_profile_raw_version = constant i64 72057594037927940, comdat
+ at __profn_f = private constant [1 x i8] c"f"
+
+define void @f() {
+; CHECK-LABEL: define void @f() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 32
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB4:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[PGOCOUNT]], 1
+; CHECK-NEXT:    store i64 [[TMP3]], ptr @__profc_f, align 8
+; CHECK-NEXT:    br label %[[BB4]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = add i16 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp uge i16 [[TMP5]], 1009
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    store i16 0, ptr @__llvm_profile_sampling, align 2
+; CHECK-NEXT:    br label %[[BB9:.*]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    store i16 [[TMP5]], ptr @__llvm_profile_sampling, align 2
+; CHECK-NEXT:    br label %[[BB9]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 32, i32 978}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1009}
+;.
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full_intsize.ll b/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full_intsize.ll
new file mode 100644
index 0000000000000..32b40e023fa81
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full_intsize.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s --passes=instrprof --sampled-instr --sampled-instr-period=1000019 --sampled-instr-burst-duration=3000 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_raw_version = comdat any
+
+ at __llvm_profile_raw_version = constant i64 72057594037927940, comdat
+ at __profn_f = private constant [1 x i8] c"f"
+
+define void @f() {
+; CHECK-LABEL: define void @f() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @__llvm_profile_sampling, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i32 [[TMP0]], 3000
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB4:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[PGOCOUNT]], 1
+; CHECK-NEXT:    store i64 [[TMP3]], ptr @__profc_f, align 8
+; CHECK-NEXT:    br label %[[BB4]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp uge i32 [[TMP5]], 1000019
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    store i32 0, ptr @__llvm_profile_sampling, align 4
+; CHECK-NEXT:    br label %[[BB9:.*]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    store i32 [[TMP5]], ptr @__llvm_profile_sampling, align 4
+; CHECK-NEXT:    br label %[[BB9]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 3000, i32 997020}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1000019}
+;.
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll b/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
new file mode 100644
index 0000000000000..e2dff645fd34c
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s --passes=instrprof --sampled-instr --sampled-instr-burst-duration=1 --sampled-instr-period=1009 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__llvm_profile_raw_version = comdat any
+
+ at __llvm_profile_raw_version = constant i64 72057594037927940, comdat
+ at __profn_f = private constant [1 x i8] c"f"
+
+define void @f() {
+; CHECK-LABEL: define void @f() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp uge i16 [[TMP1]], 1009
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB5:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[PGOCOUNT]], 1
+; CHECK-NEXT:    store i64 [[TMP4]], ptr @__profc_f, align 8
+; CHECK-NEXT:    store i16 0, ptr @__llvm_profile_sampling, align 2
+; CHECK-NEXT:    br label %[[BB6:.*]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    store i16 [[TMP1]], ptr @__llvm_profile_sampling, align 2
+; CHECK-NEXT:    br label %[[BB6]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1009}
+;.

>From c207bfb829f4ea71846ab2b607665c5400331dfb Mon Sep 17 00:00:00 2001
From: Rong Xu <xur at google.com>
Date: Mon, 15 Jul 2024 09:57:14 -0700
Subject: [PATCH 3/5] [PGO] Fix the test for windows build

Fix the test for the Windows build, as __llvm_prf_nm has a different size there.
Also rename the tests to correct a typo.
---
 ...f_bust_sampling_fast.ll => instrprof_burst_sampling_fast.ll} | 2 +-
 ...f_bust_sampling_full.ll => instrprof_burst_sampling_full.ll} | 0
 ...full_intsize.ll => instrprof_burst_sampling_full_intsize.ll} | 0
 3 files changed, 1 insertion(+), 1 deletion(-)
 rename llvm/test/Transforms/PGOProfile/{instrprof_bust_sampling_fast.ll => instrprof_burst_sampling_fast.ll} (95%)
 rename llvm/test/Transforms/PGOProfile/{instrprof_bust_sampling_full.ll => instrprof_burst_sampling_full.ll} (100%)
 rename llvm/test/Transforms/PGOProfile/{instrprof_bust_sampling_full_intsize.ll => instrprof_burst_sampling_full_intsize.ll} (100%)

diff --git a/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_fast.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
similarity index 95%
rename from llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_fast.ll
rename to llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
index dfa7cd6d3608d..a21bdb43f57df 100644
--- a/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_fast.ll
+++ b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
@@ -14,7 +14,7 @@ $__llvm_profile_raw_version = comdat any
 ; SAMPLE-VAR: @__llvm_profile_sampling = thread_local global i16 0, comdat
 ; SAMPLE-VAR: @__profc_f = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
 ; SAMPLE-VAR: @__profd_f = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 -3706093650706652785, i64 12884901887, i64 sub (i64 ptrtoint (ptr @__profc_f to i64), i64 ptrtoint (ptr @__profd_f to i64)), i64 0, ptr @f.local, ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc_f), align 8
-; SAMPLE-VAR: @__llvm_prf_nm = private constant [11 x i8] c"\01\09x\DAK\03\00\00g\00g", section "__llvm_prf_names", align 1
+; SAMPLE-VAR: @__llvm_prf_nm = private constant {{.*}}, section "__llvm_prf_names", align 1
 ; SAMPLE-VAR: @llvm.compiler.used = appending global [2 x ptr] [ptr @__llvm_profile_sampling, ptr @__profd_f], section "llvm.metadata"
 ; SAMPLE-VAR: @llvm.used = appending global [1 x ptr] [ptr @__llvm_prf_nm], section "llvm.metadata"
 
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll
similarity index 100%
rename from llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full.ll
rename to llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full_intsize.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll
similarity index 100%
rename from llvm/test/Transforms/PGOProfile/instrprof_bust_sampling_full_intsize.ll
rename to llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll

>From 9c795d9653a18f7bcf201131c7a4e1ba9fe98031 Mon Sep 17 00:00:00 2001
From: Rong Xu <xur at google.com>
Date: Tue, 16 Jul 2024 16:44:01 -0700
Subject: [PATCH 4/5] Integrate David and Wenlei review comments.

---
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  2 +-
 .../Instrumentation/InstrProfiling.cpp        | 24 +++++++++++--------
 .../PGOProfile/counter_promo_sampling.ll      |  2 +-
 .../Transforms/PGOProfile/cspgo_sample.ll     |  2 +-
 .../instrprof_burst_sampling_fast.ll          |  4 ++--
 .../instrprof_burst_sampling_full.ll          |  2 +-
 .../instrprof_burst_sampling_full_intsize.ll  |  2 +-
 .../PGOProfile/instrprof_simple_sampling.ll   |  2 +-
 8 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index ed7a9d09ee5cc..935504b070d2e 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -297,7 +297,7 @@ static cl::opt<AttributorRunOption> AttributorRun(
                           "disable attributor runs")));
 
 static cl::opt<bool> EnableSampledInstr(
-    "enable-sampled-instr", cl::init(false), cl::Hidden,
+    "enable-sampled-instrumentation", cl::init(false), cl::Hidden,
     cl::desc("Enable profile instrumentation sampling (default = off)"));
 static cl::opt<bool> UseLoopVersioningLICM(
     "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 994b62e39d3c9..f3530f76e4fa6 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -170,17 +170,18 @@ cl::opt<bool> SkipRetExitBlock(
     "skip-ret-exit-block", cl::init(true),
     cl::desc("Suppress counter promotion if exit blocks contain ret."));
 
-static cl::opt<bool> SampledInstr("sampled-instr", cl::ZeroOrMore,
+static cl::opt<bool> SampledInstr("sampled-instrumentation", cl::ZeroOrMore,
                                   cl::init(false),
                                   cl::desc("Do PGO instrumentation sampling"));
 
 static cl::opt<unsigned> SampledInstrPeriod(
     "sampled-instr-period",
     cl::desc("Set the profile instrumentation sample period. For each sample "
-             "period, the 'sampled-instr-burst-duration' number of consecutive "
-             "samples will be recorded. The default sample period of 65535 is "
-             "optimized for generating efficient code that leverages unsigned "
-             "integer wrapping in overflow."),
+             "period, a fixed number of consecutive samples will be recorded. "
+             "The number is controlled by 'sampled-instr-burst-duration' flag. "
+             "The default sample period of 65535 is optimized for generating "
+             "efficient code that leverages unsigned integer wrapping in "
+             "overflow."),
     cl::init(65535));
 
 static cl::opt<unsigned> SampledInstrBurstDuration(
@@ -683,10 +684,10 @@ PreservedAnalyses InstrProfilingLoweringPass::run(Module &M,
 // counters (value-instrumentation and edge instrumentation).
 //
 // (2) Fast burst sampling:
-// The value is an unsigned type, meaning it will wrap around to zero when
-// overflows. In this case, a second check (check2) is unnecessary, so we
-// won't generate check2 when the SampledInstrPeriod is set to 65535 (64K - 1).
-// The code after:
+// "__llvm_profile_sampling__" variable is an unsigned type, meaning it will
+// wrap around to zero when overflows. In this case, the second check is
+// unnecessary, so we won't generate check2 when the SampledInstrPeriod is
+// set to 65535 (64K - 1). The code after:
 //   if (__llvm_profile_sampling__ < SampledInstrBurstDuration) {
 //     Increment_Instruction;
 //   }
@@ -714,7 +715,10 @@ void InstrLowerer::doSampling(Instruction *I) {
 
   unsigned SampledBurstDuration = SampledInstrBurstDuration.getValue();
   unsigned SampledPeriod = SampledInstrPeriod.getValue();
-  assert(SampledBurstDuration < SampledPeriod);
+  if (SampledBurstDuration >= SampledPeriod) {
+    report_fatal_error(
+        "SampledPeriod needs to be greater than SampledBurstDuration");
+  }
   bool UseShort = (SampledPeriod <= USHRT_MAX);
   bool IsSimpleSampling = (SampledBurstDuration == 1);
   bool IsFastSampling = (!IsSimpleSampling && SampledPeriod == 65535);
diff --git a/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll b/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
index eeef5b8741501..9d083fe04015e 100644
--- a/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
+++ b/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s --passes=pgo-instr-gen,instrprof --do-counter-promotion=true --sampled-instr=true --skip-ret-exit-block=0 -S | FileCheck --check-prefixes=SAMPLING,PROMO %s
+; RUN: opt < %s --passes=pgo-instr-gen,instrprof --do-counter-promotion=true --sampled-instrumentation=true --skip-ret-exit-block=0 -S | FileCheck --check-prefixes=SAMPLING,PROMO %s
 
 ; SAMPLING: $__llvm_profile_sampling = comdat any
 ; SAMPLING: @__llvm_profile_sampling = thread_local global i16 0, comdat
diff --git a/llvm/test/Transforms/PGOProfile/cspgo_sample.ll b/llvm/test/Transforms/PGOProfile/cspgo_sample.ll
index 6683cae4e64c1..97ad4d00c9d9c 100644
--- a/llvm/test/Transforms/PGOProfile/cspgo_sample.ll
+++ b/llvm/test/Transforms/PGOProfile/cspgo_sample.ll
@@ -3,7 +3,7 @@
 
 ; RUN: opt -module-summary %s -o %t1.bc
 ; RUN: opt -module-summary %S/Inputs/cspgo_bar_sample.ll -o %t2.bc
-; RUN: llvm-lto2 run -lto-cspgo-profile-file=alloc -enable-sampled-instr -lto-cspgo-gen -save-temps -o %t %t1.bc %t2.bc \
+; RUN: llvm-lto2 run -lto-cspgo-profile-file=alloc -enable-sampled-instrumentation -lto-cspgo-gen -save-temps -o %t %t1.bc %t2.bc \
 ; RUN:   -r=%t1.bc,foo,pl \
 ; RUN:   -r=%t1.bc,bar,l \
 ; RUN:   -r=%t1.bc,main,plx \
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
index a21bdb43f57df..dcc1e805ba6f6 100644
--- a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
+++ b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s --passes=instrprof --sampled-instr -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION,SAMPLE-WEIGHT
-; RUN: opt < %s --passes=instrprof --sampled-instr --sampled-instr-burst-duration=100 -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION100,SAMPLE-WEIGHT100
+; RUN: opt < %s --passes=instrprof --sampled-instrumentation -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION,SAMPLE-WEIGHT
+; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-burst-duration=100 -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION100,SAMPLE-WEIGHT100
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll
index dd1837d5d0149..57d1a0cd33fbe 100644
--- a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll
+++ b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s --passes=instrprof -sampled-instr --sampled-instr-period=1009 --sampled-instr-burst-duration=32 -S | FileCheck %s
+; RUN: opt < %s --passes=instrprof -sampled-instrumentation --sampled-instr-period=1009 --sampled-instr-burst-duration=32 -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll
index 32b40e023fa81..1ad889524bc6a 100644
--- a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll
+++ b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s --passes=instrprof --sampled-instr --sampled-instr-period=1000019 --sampled-instr-burst-duration=3000 -S | FileCheck %s
+; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-period=1000019 --sampled-instr-burst-duration=3000 -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll b/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
index e2dff645fd34c..e57e25e37f3c1 100644
--- a/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
+++ b/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s --passes=instrprof --sampled-instr --sampled-instr-burst-duration=1 --sampled-instr-period=1009 -S | FileCheck %s
+; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-burst-duration=1 --sampled-instr-period=1009 -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

>From b8204b00f5c162685c8992e7a8bb832dd53bbdea Mon Sep 17 00:00:00 2001
From: Rong Xu <xur at google.com>
Date: Thu, 18 Jul 2024 10:04:23 -0700
Subject: [PATCH 5/5] [PGO] sampled instrumentation

Add some comments and a test per Wenlei's suggestion.
---
 .../Instrumentation/InstrProfiling.cpp        |  5 ++
 .../PGOProfile/instrprof_simple_sampling.ll   | 59 +++++++++++++------
 2 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index f3530f76e4fa6..c27408dca51aa 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -721,6 +721,8 @@ void InstrLowerer::doSampling(Instruction *I) {
   }
   bool UseShort = (SampledPeriod <= USHRT_MAX);
   bool IsSimpleSampling = (SampledBurstDuration == 1);
+  // If (SampledBurstDuration == 1 && SampledPeriod == 65535), generate
+  // the simple sampling style code.
   bool IsFastSampling = (!IsSimpleSampling && SampledPeriod == 65535);
 
   auto GetConstant = [UseShort](IRBuilder<> &Builder, uint32_t C) {
@@ -793,6 +795,9 @@ bool InstrLowerer::lowerIntrinsics(Function *F) {
   PromotionCandidates.clear();
   SmallVector<InstrProfInstBase *, 8> InstrProfInsts;
 
+  // To ensure compatibility with sampling, we save the intrinsics into
+  // a buffer to prevent potential breakage of the iterator (as the
+  // intrinsics will be moved to a different BB).
   for (BasicBlock &BB : *F) {
     for (Instruction &Instr : llvm::make_early_inc_range(BB)) {
       if (auto *IP = dyn_cast<InstrProfInstBase>(&Instr))
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll b/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
index e57e25e37f3c1..8e846bbf1d982 100644
--- a/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
+++ b/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-burst-duration=1 --sampled-instr-period=1009 -S | FileCheck %s
+; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-burst-duration=1 --sampled-instr-period=1009 -S | FileCheck %s --check-prefix=PERIOD1009
+; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-burst-duration=1 -S | FileCheck %s --check-prefix=DEFAULTPERIOD
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -10,23 +11,41 @@ $__llvm_profile_raw_version = comdat any
 @__profn_f = private constant [1 x i8] c"f"
 
 define void @f() {
-; CHECK-LABEL: define void @f() {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp uge i16 [[TMP1]], 1009
-; CHECK-NEXT:    br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB5:.*]], !prof [[PROF0:![0-9]+]]
-; CHECK:       [[BB3]]:
-; CHECK-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[PGOCOUNT]], 1
-; CHECK-NEXT:    store i64 [[TMP4]], ptr @__profc_f, align 8
-; CHECK-NEXT:    store i16 0, ptr @__llvm_profile_sampling, align 2
-; CHECK-NEXT:    br label %[[BB6:.*]]
-; CHECK:       [[BB5]]:
-; CHECK-NEXT:    store i16 [[TMP1]], ptr @__llvm_profile_sampling, align 2
-; CHECK-NEXT:    br label %[[BB6]]
-; CHECK:       [[BB6]]:
-; CHECK-NEXT:    ret void
+; PERIOD1009-LABEL: define void @f() {
+; PERIOD1009-NEXT:  [[ENTRY:.*:]]
+; PERIOD1009-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
+; PERIOD1009-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
+; PERIOD1009-NEXT:    [[TMP2:%.*]] = icmp uge i16 [[TMP1]], 1009
+; PERIOD1009-NEXT:    br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB5:.*]], !prof [[PROF0:![0-9]+]]
+; PERIOD1009:       [[BB3]]:
+; PERIOD1009-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
+; PERIOD1009-NEXT:    [[TMP4:%.*]] = add i64 [[PGOCOUNT]], 1
+; PERIOD1009-NEXT:    store i64 [[TMP4]], ptr @__profc_f, align 8
+; PERIOD1009-NEXT:    store i16 0, ptr @__llvm_profile_sampling, align 2
+; PERIOD1009-NEXT:    br label %[[BB6:.*]]
+; PERIOD1009:       [[BB5]]:
+; PERIOD1009-NEXT:    store i16 [[TMP1]], ptr @__llvm_profile_sampling, align 2
+; PERIOD1009-NEXT:    br label %[[BB6]]
+; PERIOD1009:       [[BB6]]:
+; PERIOD1009-NEXT:    ret void
+;
+; DEFAULTPERIOD-LABEL: define void @f() {
+; DEFAULTPERIOD-NEXT:  [[ENTRY:.*:]]
+; DEFAULTPERIOD-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
+; DEFAULTPERIOD-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
+; DEFAULTPERIOD-NEXT:    [[TMP2:%.*]] = icmp uge i16 [[TMP1]], -1
+; DEFAULTPERIOD-NEXT:    br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB5:.*]], !prof [[PROF0:![0-9]+]]
+; DEFAULTPERIOD:       [[BB3]]:
+; DEFAULTPERIOD-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
+; DEFAULTPERIOD-NEXT:    [[TMP4:%.*]] = add i64 [[PGOCOUNT]], 1
+; DEFAULTPERIOD-NEXT:    store i64 [[TMP4]], ptr @__profc_f, align 8
+; DEFAULTPERIOD-NEXT:    store i16 0, ptr @__llvm_profile_sampling, align 2
+; DEFAULTPERIOD-NEXT:    br label %[[BB6:.*]]
+; DEFAULTPERIOD:       [[BB5]]:
+; DEFAULTPERIOD-NEXT:    store i16 [[TMP1]], ptr @__llvm_profile_sampling, align 2
+; DEFAULTPERIOD-NEXT:    br label %[[BB6]]
+; DEFAULTPERIOD:       [[BB6]]:
+; DEFAULTPERIOD-NEXT:    ret void
 ;
 entry:
   call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
@@ -35,5 +54,7 @@ entry:
 
 declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
 ;.
-; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1009}
+; PERIOD1009: [[PROF0]] = !{!"branch_weights", i32 1, i32 1009}
+;.
+; DEFAULTPERIOD: [[PROF0]] = !{!"branch_weights", i32 1, i32 65535}
 ;.



More information about the llvm-commits mailing list