[llvm] Globaloutline2 (PR #105443)

Kyungwoo Lee via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 23 09:35:31 PDT 2024


https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/105443

>From ad8e93070836da94df51c6d15207b7d1bc80f781 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 23 Aug 2024 08:58:33 -0700
Subject: [PATCH 1/3] [StableHash][NFC] Implement with xxh3_64bits

---
 llvm/include/llvm/ADT/StableHashing.h  | 70 ++++----------------------
 llvm/lib/CodeGen/MachineOperand.cpp    |  3 +-
 llvm/lib/CodeGen/MachineStableHash.cpp | 27 ++++------
 3 files changed, 22 insertions(+), 78 deletions(-)

diff --git a/llvm/include/llvm/ADT/StableHashing.h b/llvm/include/llvm/ADT/StableHashing.h
index f675f828f702e5..a5b655a10f6996 100644
--- a/llvm/include/llvm/ADT/StableHashing.h
+++ b/llvm/include/llvm/ADT/StableHashing.h
@@ -16,6 +16,7 @@
 #define LLVM_ADT_STABLEHASHING_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/xxhash.h"
 
 namespace llvm {
 
@@ -23,78 +24,29 @@ namespace llvm {
 /// deserialized, and is stable across processes and executions.
 using stable_hash = uint64_t;
 
-// Implementation details
-namespace hashing {
-namespace detail {
-
-// Stable hashes are based on the 64-bit FNV-1 hash:
-// https://en.wikipedia.org/wiki/Fowler-Noll-Vo_hash_function
-
-const uint64_t FNV_PRIME_64 = 1099511628211u;
-const uint64_t FNV_OFFSET_64 = 14695981039346656037u;
-
-inline void stable_hash_append(stable_hash &Hash, const char Value) {
-  Hash = Hash ^ (Value & 0xFF);
-  Hash = Hash * FNV_PRIME_64;
-}
-
-inline void stable_hash_append(stable_hash &Hash, stable_hash Value) {
-  for (unsigned I = 0; I < 8; ++I) {
-    stable_hash_append(Hash, static_cast<char>(Value));
-    Value >>= 8;
-  }
+inline stable_hash stable_hash_combine(ArrayRef<stable_hash> Buffer) {
+  const uint8_t *Ptr = reinterpret_cast<const uint8_t *>(Buffer.data());
+  size_t Size = Buffer.size() * sizeof(stable_hash);
+  return xxh3_64bits(ArrayRef<uint8_t>(Ptr, Size));
 }
 
-} // namespace detail
-} // namespace hashing
-
 inline stable_hash stable_hash_combine(stable_hash A, stable_hash B) {
-  stable_hash Hash = hashing::detail::FNV_OFFSET_64;
-  hashing::detail::stable_hash_append(Hash, A);
-  hashing::detail::stable_hash_append(Hash, B);
-  return Hash;
+  stable_hash Hashes[2] = {A, B};
+  return stable_hash_combine(Hashes);
 }
 
 inline stable_hash stable_hash_combine(stable_hash A, stable_hash B,
                                        stable_hash C) {
-  stable_hash Hash = hashing::detail::FNV_OFFSET_64;
-  hashing::detail::stable_hash_append(Hash, A);
-  hashing::detail::stable_hash_append(Hash, B);
-  hashing::detail::stable_hash_append(Hash, C);
-  return Hash;
+  stable_hash Hashes[3] = {A, B, C};
+  return stable_hash_combine(Hashes);
 }
 
 inline stable_hash stable_hash_combine(stable_hash A, stable_hash B,
                                        stable_hash C, stable_hash D) {
-  stable_hash Hash = hashing::detail::FNV_OFFSET_64;
-  hashing::detail::stable_hash_append(Hash, A);
-  hashing::detail::stable_hash_append(Hash, B);
-  hashing::detail::stable_hash_append(Hash, C);
-  hashing::detail::stable_hash_append(Hash, D);
-  return Hash;
-}
-
-/// Compute a stable_hash for a sequence of values.
-///
-/// This hashes a sequence of values. It produces the same stable_hash as
-/// 'stable_hash_combine(a, b, c, ...)', but can run over arbitrary sized
-/// sequences and is significantly faster given pointers and types which
-/// can be hashed as a sequence of bytes.
-template <typename InputIteratorT>
-stable_hash stable_hash_combine_range(InputIteratorT First,
-                                      InputIteratorT Last) {
-  stable_hash Hash = hashing::detail::FNV_OFFSET_64;
-  for (auto I = First; I != Last; ++I)
-    hashing::detail::stable_hash_append(Hash, *I);
-  return Hash;
+  stable_hash Hashes[4] = {A, B, C, D};
+  return stable_hash_combine(Hashes);
 }
 
-inline stable_hash stable_hash_combine_array(const stable_hash *P, size_t C) {
-  stable_hash Hash = hashing::detail::FNV_OFFSET_64;
-  for (size_t I = 0; I < C; ++I)
-    hashing::detail::stable_hash_append(Hash, P[I]);
-  return Hash;
-}
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index ace05902d5df79..a0726ca64910ea 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -424,8 +424,7 @@ hash_code llvm::hash_value(const MachineOperand &MO) {
       const uint32_t *RegMask = MO.getRegMask();
       std::vector<stable_hash> RegMaskHashes(RegMask, RegMask + RegMaskSize);
       return hash_combine(MO.getType(), MO.getTargetFlags(),
-                          stable_hash_combine_array(RegMaskHashes.data(),
-                                                    RegMaskHashes.size()));
+                          stable_hash_combine(RegMaskHashes));
     }
 
     assert(0 && "MachineOperand not associated with any MachineFunction");
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index fb5e9a37d9b997..916acbf2d2cbf9 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -66,7 +66,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
       SmallVector<stable_hash> DefOpcodes;
       for (auto &Def : MRI.def_instructions(MO.getReg()))
         DefOpcodes.push_back(Def.getOpcode());
-      return stable_hash_combine_range(DefOpcodes.begin(), DefOpcodes.end());
+      return stable_hash_combine(DefOpcodes);
     }
 
     // Register operands don't have target flags.
@@ -78,8 +78,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
   case MachineOperand::MO_FPImmediate: {
     auto Val = MO.isCImm() ? MO.getCImm()->getValue()
                            : MO.getFPImm()->getValueAPF().bitcastToAPInt();
-    auto ValHash =
-        stable_hash_combine_array(Val.getRawData(), Val.getNumWords());
+    auto ValHash = stable_hash_combine(
+        ArrayRef<stable_hash>(Val.getRawData(), Val.getNumWords()));
     return stable_hash_combine(MO.getType(), MO.getTargetFlags(), ValHash);
   }
 
@@ -126,10 +126,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
           const uint32_t *RegMask = MO.getRegMask();
           std::vector<llvm::stable_hash> RegMaskHashes(RegMask,
                                                        RegMask + RegMaskSize);
-          return stable_hash_combine(
-              MO.getType(), MO.getTargetFlags(),
-              stable_hash_combine_array(RegMaskHashes.data(),
-                                        RegMaskHashes.size()));
+          return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
+                                     stable_hash_combine(RegMaskHashes));
         }
       }
     }
@@ -145,10 +143,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
         MO.getShuffleMask(), std::back_inserter(ShuffleMaskHashes),
         [](int S) -> llvm::stable_hash { return llvm::stable_hash(S); });
 
-    return stable_hash_combine(
-        MO.getType(), MO.getTargetFlags(),
-        stable_hash_combine_array(ShuffleMaskHashes.data(),
-                                  ShuffleMaskHashes.size()));
+    return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
+                               stable_hash_combine(ShuffleMaskHashes));
   }
   case MachineOperand::MO_MCSymbol: {
     auto SymbolName = MO.getMCSymbol()->getName();
@@ -212,8 +208,7 @@ stable_hash llvm::stableHashValue(const MachineInstr &MI, bool HashVRegs,
     HashComponents.push_back(static_cast<unsigned>(Op->getFailureOrdering()));
   }
 
-  return stable_hash_combine_range(HashComponents.begin(),
-                                   HashComponents.end());
+  return stable_hash_combine(HashComponents);
 }
 
 stable_hash llvm::stableHashValue(const MachineBasicBlock &MBB) {
@@ -221,8 +216,7 @@ stable_hash llvm::stableHashValue(const MachineBasicBlock &MBB) {
   // TODO: Hash more stuff like block alignment and branch probabilities.
   for (const auto &MI : MBB)
     HashComponents.push_back(stableHashValue(MI));
-  return stable_hash_combine_range(HashComponents.begin(),
-                                   HashComponents.end());
+  return stable_hash_combine(HashComponents);
 }
 
 stable_hash llvm::stableHashValue(const MachineFunction &MF) {
@@ -230,6 +224,5 @@ stable_hash llvm::stableHashValue(const MachineFunction &MF) {
   // TODO: Hash lots more stuff like function alignment and stack objects.
   for (const auto &MBB : MF)
     HashComponents.push_back(stableHashValue(MBB));
-  return stable_hash_combine_range(HashComponents.begin(),
-                                   HashComponents.end());
+  return stable_hash_combine(HashComponents);
 }

>From c9eb32c6f8462cb2d58d1580fda7be66d90b415f Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Wed, 24 Apr 2024 09:40:34 -0700
Subject: [PATCH 2/3] [MachineOutliner][NFC] Refactor

---
 llvm/include/llvm/CodeGen/MachineOutliner.h  |  5 +-
 llvm/include/llvm/CodeGen/TargetInstrInfo.h  | 12 ++++-
 llvm/lib/CodeGen/MachineOutliner.cpp         | 55 +++++++++++---------
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp |  7 +--
 llvm/lib/Target/AArch64/AArch64InstrInfo.h   |  3 +-
 5 files changed, 48 insertions(+), 34 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h
index eaba6c9b18f2bb..84937a8b563ac0 100644
--- a/llvm/include/llvm/CodeGen/MachineOutliner.h
+++ b/llvm/include/llvm/CodeGen/MachineOutliner.h
@@ -234,11 +234,11 @@ struct OutlinedFunction {
   unsigned FrameConstructionID = 0;
 
   /// Return the number of candidates for this \p OutlinedFunction.
-  unsigned getOccurrenceCount() const { return Candidates.size(); }
+  virtual unsigned getOccurrenceCount() const { return Candidates.size(); }
 
   /// Return the number of bytes it would take to outline this
   /// function.
-  unsigned getOutliningCost() const {
+  virtual unsigned getOutliningCost() const {
     unsigned CallOverhead = 0;
     for (const Candidate &C : Candidates)
       CallOverhead += C.getCallOverhead();
@@ -272,6 +272,7 @@ struct OutlinedFunction {
   }
 
   OutlinedFunction() = delete;
+  virtual ~OutlinedFunction() = default;
 };
 } // namespace outliner
 } // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 882cadea223695..a833a541e4e025 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2088,14 +2088,22 @@ class TargetInstrInfo : public MCInstrInfo {
 
   /// Returns a \p outliner::OutlinedFunction struct containing target-specific
   /// information for a set of outlining candidates. Returns std::nullopt if the
-  /// candidates are not suitable for outlining.
+  /// candidates are not suitable for outlining. \p MinRep is the minimum
+  /// number of times the instruction sequence must be repeated.
   virtual std::optional<outliner::OutlinedFunction> getOutliningCandidateInfo(
       const MachineModuleInfo &MMI,
-      std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+      std::vector<outliner::Candidate> &RepeatedSequenceLocs,
+      unsigned MinRep) const {
     llvm_unreachable(
         "Target didn't implement TargetInstrInfo::getOutliningCandidateInfo!");
   }
 
+  virtual std::optional<outliner::OutlinedFunction> getOutliningCandidateInfo(
+      const MachineModuleInfo &MMI,
+      std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+    return getOutliningCandidateInfo(MMI, RepeatedSequenceLocs, /*MinRep=*/2);
+  }
+
   /// Optional target hook to create the LLVM IR attributes for the outlined
   /// function. If overridden, the overriding function must call the default
   /// implementation.
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index 4b56a467b8d076..eecf27613a2c31 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -456,8 +456,9 @@ struct MachineOutliner : public ModulePass {
   /// \param Mapper Contains outlining mapping information.
   /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions
   /// each type of candidate.
-  void findCandidates(InstructionMapper &Mapper,
-                      std::vector<OutlinedFunction> &FunctionList);
+  void
+  findCandidates(InstructionMapper &Mapper,
+                 std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList);
 
   /// Replace the sequences of instructions represented by \p OutlinedFunctions
   /// with calls to functions.
@@ -465,7 +466,9 @@ struct MachineOutliner : public ModulePass {
   /// \param M The module we are outlining from.
   /// \param FunctionList A list of functions to be inserted into the module.
   /// \param Mapper Contains the instruction mappings for the module.
-  bool outline(Module &M, std::vector<OutlinedFunction> &FunctionList,
+  /// \param[out] OutlinedFunctionNum The outlined function number.
+  bool outline(Module &M,
+               std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList,
                InstructionMapper &Mapper, unsigned &OutlinedFunctionNum);
 
   /// Creates a function for \p OF and inserts it into the module.
@@ -583,7 +586,8 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) {
 }
 
 void MachineOutliner::findCandidates(
-    InstructionMapper &Mapper, std::vector<OutlinedFunction> &FunctionList) {
+    InstructionMapper &Mapper,
+    std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList) {
   FunctionList.clear();
   SuffixTree ST(Mapper.UnsignedVec, OutlinerLeafDescendants);
 
@@ -684,7 +688,7 @@ void MachineOutliner::findCandidates(
       continue;
     }
 
-    FunctionList.push_back(*OF);
+    FunctionList.push_back(std::make_unique<OutlinedFunction>(*OF));
   }
 }
 
@@ -827,10 +831,9 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
   return &MF;
 }
 
-bool MachineOutliner::outline(Module &M,
-                              std::vector<OutlinedFunction> &FunctionList,
-                              InstructionMapper &Mapper,
-                              unsigned &OutlinedFunctionNum) {
+bool MachineOutliner::outline(
+    Module &M, std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList,
+    InstructionMapper &Mapper, unsigned &OutlinedFunctionNum) {
   LLVM_DEBUG(dbgs() << "*** Outlining ***\n");
   LLVM_DEBUG(dbgs() << "NUMBER OF POTENTIAL FUNCTIONS: " << FunctionList.size()
                     << "\n");
@@ -838,23 +841,23 @@ bool MachineOutliner::outline(Module &M,
 
   // Sort by priority where priority := getNotOutlinedCost / getOutliningCost.
   // The function with highest priority should be outlined first.
-  stable_sort(FunctionList,
-              [](const OutlinedFunction &LHS, const OutlinedFunction &RHS) {
-                return LHS.getNotOutlinedCost() * RHS.getOutliningCost() >
-                       RHS.getNotOutlinedCost() * LHS.getOutliningCost();
-              });
+  stable_sort(FunctionList, [](const std::unique_ptr<OutlinedFunction> &LHS,
+                               const std::unique_ptr<OutlinedFunction> &RHS) {
+    return LHS->getNotOutlinedCost() * RHS->getOutliningCost() >
+           RHS->getNotOutlinedCost() * LHS->getOutliningCost();
+  });
 
   // Walk over each function, outlining them as we go along. Functions are
   // outlined greedily, based off the sort above.
   auto *UnsignedVecBegin = Mapper.UnsignedVec.begin();
   LLVM_DEBUG(dbgs() << "WALKING FUNCTION LIST\n");
-  for (OutlinedFunction &OF : FunctionList) {
+  for (auto &OF : FunctionList) {
 #ifndef NDEBUG
-    auto NumCandidatesBefore = OF.Candidates.size();
+    auto NumCandidatesBefore = OF->Candidates.size();
 #endif
     // If we outlined something that overlapped with a candidate in a previous
     // step, then we can't outline from it.
-    erase_if(OF.Candidates, [&UnsignedVecBegin](Candidate &C) {
+    erase_if(OF->Candidates, [&UnsignedVecBegin](Candidate &C) {
       return std::any_of(UnsignedVecBegin + C.getStartIdx(),
                          UnsignedVecBegin + C.getEndIdx() + 1, [](unsigned I) {
                            return I == static_cast<unsigned>(-1);
@@ -862,36 +865,36 @@ bool MachineOutliner::outline(Module &M,
     });
 
 #ifndef NDEBUG
-    auto NumCandidatesAfter = OF.Candidates.size();
+    auto NumCandidatesAfter = OF->Candidates.size();
     LLVM_DEBUG(dbgs() << "PRUNED: " << NumCandidatesBefore - NumCandidatesAfter
                       << "/" << NumCandidatesBefore << " candidates\n");
 #endif
 
     // If we made it unbeneficial to outline this function, skip it.
-    if (OF.getBenefit() < OutlinerBenefitThreshold) {
-      LLVM_DEBUG(dbgs() << "SKIP: Expected benefit (" << OF.getBenefit()
+    if (OF->getBenefit() < OutlinerBenefitThreshold) {
+      LLVM_DEBUG(dbgs() << "SKIP: Expected benefit (" << OF->getBenefit()
                         << " B) < threshold (" << OutlinerBenefitThreshold
                         << " B)\n");
       continue;
     }
 
-    LLVM_DEBUG(dbgs() << "OUTLINE: Expected benefit (" << OF.getBenefit()
+    LLVM_DEBUG(dbgs() << "OUTLINE: Expected benefit (" << OF->getBenefit()
                       << " B) > threshold (" << OutlinerBenefitThreshold
                       << " B)\n");
 
     // It's beneficial. Create the function and outline its sequence's
     // occurrences.
-    OF.MF = createOutlinedFunction(M, OF, Mapper, OutlinedFunctionNum);
-    emitOutlinedFunctionRemark(OF);
+    OF->MF = createOutlinedFunction(M, *OF, Mapper, OutlinedFunctionNum);
+    emitOutlinedFunctionRemark(*OF);
     FunctionsCreated++;
     OutlinedFunctionNum++; // Created a function, move to the next name.
-    MachineFunction *MF = OF.MF;
+    MachineFunction *MF = OF->MF;
     const TargetSubtargetInfo &STI = MF->getSubtarget();
     const TargetInstrInfo &TII = *STI.getInstrInfo();
 
     // Replace occurrences of the sequence with calls to the new function.
     LLVM_DEBUG(dbgs() << "CREATE OUTLINED CALLS\n");
-    for (Candidate &C : OF.Candidates) {
+    for (Candidate &C : OF->Candidates) {
       MachineBasicBlock &MBB = *C.getMBB();
       MachineBasicBlock::iterator StartIt = C.begin();
       MachineBasicBlock::iterator EndIt = std::prev(C.end());
@@ -1180,7 +1183,7 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
 
   // Prepare instruction mappings for the suffix tree.
   populateMapper(Mapper, M);
-  std::vector<OutlinedFunction> FunctionList;
+  std::vector<std::unique_ptr<OutlinedFunction>> FunctionList;
 
   // Find all of the outlining candidates.
   findCandidates(Mapper, FunctionList);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 697ae510a95655..156ab6568f833e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -8687,7 +8687,8 @@ static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
 std::optional<outliner::OutlinedFunction>
 AArch64InstrInfo::getOutliningCandidateInfo(
     const MachineModuleInfo &MMI,
-    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+    std::vector<outliner::Candidate> &RepeatedSequenceLocs,
+    unsigned MinRep) const {
   unsigned SequenceSize = 0;
   for (auto &MI : RepeatedSequenceLocs[0])
     SequenceSize += getInstSizeInBytes(MI);
@@ -8801,7 +8802,7 @@ AArch64InstrInfo::getOutliningCandidateInfo(
     llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
 
     // If the sequence doesn't have enough candidates left, then we're done.
-    if (RepeatedSequenceLocs.size() < 2)
+    if (RepeatedSequenceLocs.size() < MinRep)
       return std::nullopt;
   }
 
@@ -9048,7 +9049,7 @@ AArch64InstrInfo::getOutliningCandidateInfo(
     }
 
     // If we dropped all of the candidates, bail out here.
-    if (RepeatedSequenceLocs.size() < 2) {
+    if (RepeatedSequenceLocs.size() < MinRep) {
       RepeatedSequenceLocs.clear();
       return std::nullopt;
     }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index a1f2fbff016312..762fb9873065e6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -473,7 +473,8 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
                                    bool OutlineFromLinkOnceODRs) const override;
   std::optional<outliner::OutlinedFunction> getOutliningCandidateInfo(
       const MachineModuleInfo &MMI,
-      std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
+      std::vector<outliner::Candidate> &RepeatedSequenceLocs,
+      unsigned MinRep) const override;
   void mergeOutliningCandidateAttributes(
       Function &F, std::vector<outliner::Candidate> &Candidates) const override;
   outliner::InstrType getOutliningTypeImpl(const MachineModuleInfo &MMI,

>From 77698217d0d272a5ddd15ffb5b65871b4af741f7 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 23 Aug 2024 09:33:47 -0700
Subject: [PATCH 3/3] globaloutline2 on xxh3

---
 llvm/include/llvm/CodeGen/MachineOutliner.h   |  36 +++
 llvm/lib/CGData/CodeGenData.cpp               |  26 +-
 llvm/lib/CodeGen/CMakeLists.txt               |   1 +
 llvm/lib/CodeGen/MachineOutliner.cpp          | 260 +++++++++++++++++-
 llvm/lib/CodeGen/MachineStableHash.cpp        |  20 +-
 llvm/test/CodeGen/AArch64/O3-pipeline.ll      |   1 +
 .../CodeGen/AArch64/cgdata-global-hash.ll     |  40 +++
 .../CodeGen/AArch64/cgdata-outlined-name.ll   |  41 +++
 .../AArch64/cgdata-read-double-outline.ll     |  57 ++++
 .../AArch64/cgdata-read-lto-outline.ll        |  96 +++++++
 .../CodeGen/AArch64/cgdata-read-priority.ll   |  68 +++++
 .../AArch64/cgdata-read-single-outline.ll     |  42 +++
 .../CodeGen/AArch64/cgdata-write-outline.ll   |  51 ++++
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |   1 +
 14 files changed, 734 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-global-hash.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-outlined-name.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-priority.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-write-outline.ll

diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h
index 84937a8b563ac0..3bda86f399b7b5 100644
--- a/llvm/include/llvm/CodeGen/MachineOutliner.h
+++ b/llvm/include/llvm/CodeGen/MachineOutliner.h
@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineStableHash.h"
 #include <initializer_list>
 
 namespace llvm {
@@ -274,6 +275,41 @@ struct OutlinedFunction {
   OutlinedFunction() = delete;
   virtual ~OutlinedFunction() = default;
 };
+
+/// The information necessary to create an outlined function that is matched
+/// globally.
+struct GlobalOutlinedFunction : public OutlinedFunction {
+  GlobalOutlinedFunction(OutlinedFunction &OF, unsigned GlobalOccurrenceCount)
+      : OutlinedFunction(OF.Candidates, OF.SequenceSize, OF.FrameOverhead,
+                         OF.FrameConstructionID),
+        GlobalOccurrenceCount(GlobalOccurrenceCount) {}
+
+  unsigned GlobalOccurrenceCount;
+
+  /// Return the number of times this sequence appears globally.
+  /// A global outlining candidate is created uniquely for each match, but it
+  /// might be erased when it overlaps with a previously outlined
+  /// instance.
+  unsigned getOccurrenceCount() const override {
+    assert(Candidates.size() <= 1);
+    return Candidates.empty() ? 0 : GlobalOccurrenceCount;
+  }
+
+  /// Return the outlining cost using the global occurrence count
+  /// with the same cost as the first (unique) candidate.
+  unsigned getOutliningCost() const override {
+    assert(Candidates.size() <= 1);
+    unsigned CallOverhead =
+        Candidates.empty()
+            ? 0
+            : Candidates[0].getCallOverhead() * getOccurrenceCount();
+    return CallOverhead + SequenceSize + FrameOverhead;
+  }
+
+  GlobalOutlinedFunction() = delete;
+  ~GlobalOutlinedFunction() = default;
+};
+
 } // namespace outliner
 } // namespace llvm
 
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 9dd4b1674e094a..55d2504231c744 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -24,6 +24,13 @@
 using namespace llvm;
 using namespace cgdata;
 
+cl::opt<bool>
+    CodeGenDataGenerate("codegen-data-generate", cl::init(false), cl::Hidden,
+                        cl::desc("Emit CodeGen Data into custom sections"));
+cl::opt<std::string>
+    CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
+                       cl::desc("File path to where .cgdata file is read"));
+
 static std::string getCGDataErrString(cgdata_error Err,
                                       const std::string &ErrMsg = "") {
   std::string Msg;
@@ -132,7 +139,24 @@ CodeGenData &CodeGenData::getInstance() {
   std::call_once(CodeGenData::OnceFlag, []() {
     Instance = std::unique_ptr<CodeGenData>(new CodeGenData());
 
-    // TODO: Initialize writer or reader mode for the client optimization.
+    if (CodeGenDataGenerate)
+      Instance->EmitCGData = true;
+    else if (!CodeGenDataUsePath.empty()) {
+      // Initialize the global CGData if the input file name is given.
+      // We do not error-out when failing to parse the input file.
+      // Instead, just emit a warning message and fall back as if no CGData
+      // were available.
+      auto FS = vfs::getRealFileSystem();
+      auto ReaderOrErr = CodeGenDataReader::create(CodeGenDataUsePath, *FS);
+      if (Error E = ReaderOrErr.takeError()) {
+        warn(std::move(E), CodeGenDataUsePath);
+        return;
+      }
+      // Publish each CGData based on the data type in the header.
+      auto Reader = ReaderOrErr->get();
+      if (Reader->hasOutlinedHashTree())
+        Instance->publishOutlinedHashTree(Reader->releaseOutlinedHashTree());
+    }
   });
   return *(Instance.get());
 }
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index f1607f85c5b319..3e75737185c3ee 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -267,6 +267,7 @@ add_llvm_component_library(LLVMCodeGen
   Analysis
   BitReader
   BitWriter
+  CGData
   CodeGenTypes
   Core
   MC
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index eecf27613a2c31..b8b108dcc5be4b 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -59,7 +59,9 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CGData/CodeGenDataReader.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
@@ -75,6 +77,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/SuffixTree.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <functional>
 #include <tuple>
 #include <vector>
@@ -98,6 +101,10 @@ STATISTIC(NumInvisible,
           "Invisible instructions skipped during mapping");
 STATISTIC(UnsignedVecSize,
           "Total number of instructions mapped and saved to mapping vector");
+STATISTIC(StableHashAttempts,
+          "Count of hashing attempts made for outlined functions");
+STATISTIC(StableHashDropped,
+          "Count of unsuccessful hashing attempts for outlined functions");
 
 // Set to true if the user wants the outliner to run on linkonceodr linkage
 // functions. This is false by default because the linker can dedupe linkonceodr
@@ -128,6 +135,19 @@ static cl::opt<bool> OutlinerLeafDescendants(
              "tree as candidates for outlining (if false, only leaf children "
              "are considered)"));
 
+static cl::opt<bool>
+    DisableGlobalOutlining("disable-global-outlining", cl::Hidden,
+                           cl::desc("Disable global outlining only by ignoring "
+                                    "the codegen data generation or use"),
+                           cl::init(false));
+
+static cl::opt<bool> AppendContentHashToOutlinedName(
+    "append-content-hash-outlined-name", cl::Hidden,
+    cl::desc("This appends the content hash to the globally outlined function "
+             "name. It's beneficial for enhancing the precision of the stable "
+             "hash and for ordering the outlined functions."),
+    cl::init(true));
+
 namespace {
 
 /// Maps \p MachineInstrs to unsigned integers and stores the mappings.
@@ -421,11 +441,29 @@ struct MachineOutliner : public ModulePass {
   /// Set when the pass is constructed in TargetPassConfig.
   bool RunOnAllFunctions = true;
 
+  /// This is a compact representation of hash sequences of outlined functions.
+  /// It is used when OutlinerMode = CGDataMode::Write.
+  /// The resulting hash tree will be emitted into __llvm_outlined section
+  /// which will be dead-stripped not going to the final binary.
+  /// A post-process using llvm-cgdata, lld, or ThinLTO can merge them into
+  /// a global outlined hash tree for the subsequent codegen.
+  std::unique_ptr<OutlinedHashTree> LocalHashTree;
+
+  /// The mode of the outliner.
+  /// When it's CGDataMode::None, candidates are populated with the suffix tree
+  /// within a module and outlined.
+  /// When it's CGDataMode::Write, in addition to CGDataMode::None, the hash
+  /// sequences of outlined functions are published into LocalHashTree.
+  /// When it's CGDataMode::Read, candidates are populated with the global
+  /// outlined hash tree that has been built by the previous codegen.
+  CGDataMode OutlinerMode = CGDataMode::None;
+
   StringRef getPassName() const override { return "Machine Outliner"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineModuleInfoWrapperPass>();
     AU.addPreserved<MachineModuleInfoWrapperPass>();
+    AU.addRequired<ImmutableModuleSummaryIndexWrapperPass>();
     AU.setPreservesAll();
     ModulePass::getAnalysisUsage(AU);
   }
@@ -460,6 +498,16 @@ struct MachineOutliner : public ModulePass {
   findCandidates(InstructionMapper &Mapper,
                  std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList);
 
+  /// Find all repeated substrings that match in the global outlined hash
+  /// tree built from the previous codegen.
+  ///
+  /// \param Mapper Contains outlining mapping information.
+  /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions
+  /// for each type of candidate.
+  void findGlobalCandidates(
+      InstructionMapper &Mapper,
+      std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList);
+
   /// Replace the sequences of instructions represented by \p OutlinedFunctions
   /// with calls to functions.
   ///
@@ -476,6 +524,17 @@ struct MachineOutliner : public ModulePass {
                                           InstructionMapper &Mapper,
                                           unsigned Name);
 
+  /// Compute and publish the stable hash sequence of instructions in the
+  /// outlined function, \p MF. The parameter \p CandSize represents the number
+  /// of candidates that have identical instruction sequences to \p MF.
+  void computeAndPublishHashSequence(MachineFunction &MF, unsigned CandSize);
+
+  /// Initialize the outliner mode.
+  void initializeOutlinerMode(const Module &M);
+
+  /// Emit the outlined hash tree into __llvm_outline section.
+  void emitOutlinedHashTree(Module &M);
+
   /// Calls 'doOutline()' 1 + OutlinerReruns times.
   bool runOnModule(Module &M) override;
 
@@ -585,6 +644,109 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) {
   MORE.emit(R);
 }
 
+struct MatchedEntry { // A run of mapped instructions found in the hash tree.
+  size_t StartIdx; // Index of the first matched instruction in the mapping.
+  size_t Length; // Index span of the match, first and last index included.
+  size_t Count; // Terminal count stored on the matched hash-tree node.
+};
+
+static const HashNode *followHashNode(stable_hash StableHash,
+                                      const HashNode *Current) {
+  auto Child = Current->Successors.find(StableHash); // successor for the hash
+  return (Child != Current->Successors.end()) ? Child->second.get() : nullptr;
+}
+
+// Find all matches in the global outlined hash tree.
+// It's quadratic complexity in theory, but it's nearly linear in practice
+// since the length of outlined sequences is small within a block.
+//
+// For each start index, the tree is walked from its root using the stable
+// hash of every following mapped instruction. Each node on the way that has a
+// terminal count yields one MatchedEntry {start, index span, terminal count}.
+static std::vector<MatchedEntry> getMatchedEntries(InstructionMapper &Mapper) {
+  auto &InstrList = Mapper.InstrList;
+  auto &UnsignedVec = Mapper.UnsignedVec;
+
+  std::vector<MatchedEntry> MatchedEntries;
+  auto Size = UnsignedVec.size();
+
+  // Get the global outlined hash tree built from the previous run.
+  assert(cgdata::hasOutlinedHashTree());
+  const auto *RootNode = cgdata::getOutlinedHashTree()->getRoot();
+  for (size_t I = 0; I < Size; ++I) {
+    // skip the invalid mapping that represents a large negative value.
+    if (UnsignedVec[I] >= Size)
+      continue;
+    const MachineInstr &MI = *InstrList[I];
+    // skip debug instructions as we did for the outlined function.
+    if (MI.isDebugInstr())
+      continue;
+    // skip the empty hash value.
+    stable_hash StableHashI = stableHashValue(MI);
+    if (!StableHashI)
+      continue;
+
+    const HashNode *LastNode = followHashNode(StableHashI, RootNode);
+    if (!LastNode)
+      continue;
+
+    size_t J = I + 1;
+    for (; J < Size; ++J) {
+      // break on the invalid mapping that represents a large negative value.
+      if (UnsignedVec[J] >= Size)
+        break;
+      // ignore debug instructions as we did for the outlined function.
+      const MachineInstr &MJ = *InstrList[J];
+      if (MJ.isDebugInstr())
+        continue;
+      // break on the empty hash value.
+      stable_hash StableHashJ = stableHashValue(MJ);
+      if (!StableHashJ)
+        break;
+      LastNode = followHashNode(StableHashJ, LastNode);
+      if (!LastNode)
+        break;
+
+      // Even with a match ending with a terminal, we continue finding
+      // matches to populate all candidates.
+      auto Count = LastNode->Terminals;
+      if (Count)
+        MatchedEntries.push_back({I, J - I + 1, *Count});
+    }
+  }
+
+  return MatchedEntries;
+}
+
+void MachineOutliner::findGlobalCandidates(
+    InstructionMapper &Mapper,
+    std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList) {
+  FunctionList.clear();
+  auto &Instrs = Mapper.InstrList;
+  auto &BlockFlags = Mapper.MBBFlagsMap;
+
+  std::vector<Candidate> Singleton; // refilled with one candidate per match
+  for (auto &Hit : getMatchedEntries(Mapper)) {
+    MachineBasicBlock::iterator First = Instrs[Hit.StartIdx];
+    MachineBasicBlock::iterator Last = Instrs[Hit.StartIdx + Hit.Length - 1];
+    MachineBasicBlock *Parent = First->getParent();
+    Candidate Cand(Hit.StartIdx, Hit.Length, First, Last, Parent,
+                   FunctionList.size(), BlockFlags[Parent]);
+    Singleton.clear();
+    Singleton.push_back(Cand);
+    const TargetInstrInfo *TII = Cand.getMF()->getSubtarget().getInstrInfo();
+    std::optional<OutlinedFunction> Info =
+        TII->getOutliningCandidateInfo(*MMI, Singleton, /*MinRep=*/1);
+    // Skip the match when the target hook yields no usable candidate.
+    if (Info && !Info->Candidates.empty()) {
+      // We create a global candidate each match.
+      assert(Info->Candidates.size() == 1);
+      FunctionList.push_back(
+          std::make_unique<GlobalOutlinedFunction>(*Info, Hit.Count));
+    }
+  }
+}
+
 void MachineOutliner::findCandidates(
     InstructionMapper &Mapper,
     std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList) {
@@ -692,6 +854,40 @@ void MachineOutliner::findCandidates(
   }
 }
 
+void MachineOutliner::computeAndPublishHashSequence(MachineFunction &MF,
+                                                    unsigned CandSize) {
+  // Compute the hash sequence for the outlined function.
+  SmallVector<stable_hash> OutlinedHashSequence;
+  bool Hashable = true; // Cleared once an instruction has no stable hash.
+  for (auto MBBI = MF.begin(); Hashable && MBBI != MF.end(); ++MBBI)
+    for (auto &NewMI : *MBBI) {
+      stable_hash Hash = stableHashValue(NewMI);
+      if (!Hash) {
+        OutlinedHashSequence.clear();
+        Hashable = false;
+        break;
+      }
+      OutlinedHashSequence.push_back(Hash);
+    }
+
+  // Append a unique name based on the non-empty hash sequence.
+  if (AppendContentHashToOutlinedName && !OutlinedHashSequence.empty()) {
+    auto CombinedHash = stable_hash_combine_range(OutlinedHashSequence.begin(),
+                                                  OutlinedHashSequence.end());
+    MF.getFunction().setName(MF.getName().str() + ".content." +
+                             std::to_string(CombinedHash));
+  }
+
+  // Publish the non-empty hash sequence to the local hash tree.
+  if (OutlinerMode == CGDataMode::Write) {
+    StableHashAttempts++;
+    if (!OutlinedHashSequence.empty())
+      LocalHashTree->insert({OutlinedHashSequence, CandSize});
+    else
+      StableHashDropped++;
+  }
+}
+
 MachineFunction *MachineOutliner::createOutlinedFunction(
     Module &M, OutlinedFunction &OF, InstructionMapper &Mapper, unsigned Name) {
 
@@ -767,6 +963,9 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
     }
   }
 
+  if (OutlinerMode != CGDataMode::None)
+    computeAndPublishHashSequence(MF, OF.Candidates.size());
+
   // Set normal properties for a late MachineFunction.
   MF.getProperties().reset(MachineFunctionProperties::Property::IsSSA);
   MF.getProperties().set(MachineFunctionProperties::Property::NoPHIs);
@@ -1131,12 +1330,65 @@ void MachineOutliner::emitInstrCountChangedRemark(
   }
 }
 
+void MachineOutliner::initializeOutlinerMode(const Module &M) {
+  if (DisableGlobalOutlining)
+    return;
+
+  if (auto *IndexWrapperPass =
+          getAnalysisIfAvailable<ImmutableModuleSummaryIndexWrapperPass>()) {
+    auto *TheIndex = IndexWrapperPass->getIndex();
+    // (Full)LTO module does not have functions added to the index.
+    // In this case, we run the outliner without using codegen data as usual.
+    if (TheIndex && !TheIndex->hasExportedFunctions(M))
+      return;
+  }
+
+  // When codegen data write is enabled, we want to write the local outlined
+  // hash tree to the custom section, `__llvm_outline`.
+  // When the outlined hash tree is available from the previous codegen data,
+  // we want to read it to optimistically create global outlining candidates.
+  if (cgdata::emitCGData()) {
+    // Writing wins over reading; create the local hash tree to be published.
+    OutlinerMode = CGDataMode::Write;
+    LocalHashTree = std::make_unique<OutlinedHashTree>();
+  } else if (cgdata::hasOutlinedHashTree()) {
+    OutlinerMode = CGDataMode::Read;
+  }
+}
+
+void MachineOutliner::emitOutlinedHashTree(Module &M) {
+  assert(LocalHashTree);
+  // An empty tree has nothing to publish, so no section is emitted.
+  if (LocalHashTree->empty())
+    return;
+
+  LLVM_DEBUG({
+    dbgs() << "Emit outlined hash tree. Size: " << LocalHashTree->size()
+           << "\n";
+  });
+  // Serialize the tree (ownership moves into the record) into a byte buffer.
+  SmallVector<char> Bytes;
+  raw_svector_ostream Stream(Bytes);
+  OutlinedHashTreeRecord Record(std::move(LocalHashTree));
+  Record.serialize(Stream);
+  // Expose the bytes as a MemoryBuffer and embed them in the module section.
+  std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer(
+      StringRef(Bytes.data(), Bytes.size()), "in-memory outlined hash tree",
+      /*RequiresNullTerminator=*/false);
+  Triple TT(M.getTargetTriple());
+  embedBufferInModule(
+      M, *Buffer, getCodeGenDataSectionName(CG_outline, TT.getObjectFormat()));
+}
+
 bool MachineOutliner::runOnModule(Module &M) {
   // Check if there's anything in the module. If it's empty, then there's
   // nothing to outline.
   if (M.empty())
     return false;
 
+  // Initialize the outliner mode.
+  initializeOutlinerMode(M);
+
   MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
 
   // Number to append to the current outlined function.
@@ -1158,6 +1410,9 @@ bool MachineOutliner::runOnModule(Module &M) {
     }
   }
 
+  if (OutlinerMode == CGDataMode::Write)
+    emitOutlinedHashTree(M);
+
   return true;
 }
 
@@ -1186,7 +1441,10 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
   std::vector<std::unique_ptr<OutlinedFunction>> FunctionList;
 
   // Find all of the outlining candidates.
-  findCandidates(Mapper, FunctionList);
+  if (OutlinerMode == CGDataMode::Read)
+    findGlobalCandidates(Mapper, FunctionList);
+  else
+    findCandidates(Mapper, FunctionList);
 
   // If we've requested size remarks, then collect the MI counts of every
   // function before outlining, and the MI counts after outlining.
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index 916acbf2d2cbf9..844bc9e36442f4 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -95,9 +95,22 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
   case MachineOperand::MO_Metadata:
     StableHashBailingMetadataUnsupported++;
     return 0;
-  case MachineOperand::MO_GlobalAddress:
-    StableHashBailingGlobalAddress++;
-    return 0;
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+    if (GV->hasPrivateLinkage() || !GV->hasName()) {
+      StableHashBailingGlobalAddress++;
+      return 0;
+    }
+    auto Name = GV->getName();
+    // For an outlined function, hash the name from its last ".content." on.
+    auto Pos = Name.rfind(".content.");
+    if (Pos != StringRef::npos) {
+      assert(Name.starts_with("OUTLINED_FUNCTION"));
+      Name = Name.substr(Pos);
+    }
+    return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
+                               xxh3_64bits(Name), MO.getOffset());
+  }
   case MachineOperand::MO_TargetIndex: {
     if (const char *Name = MO.getTargetIndexName())
       return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
@@ -142,7 +155,6 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
     llvm::transform(
         MO.getShuffleMask(), std::back_inserter(ShuffleMaskHashes),
         [](int S) -> llvm::stable_hash { return llvm::stable_hash(S); });
-
     return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
                                stable_hash_combine(ShuffleMaskHashes));
   }
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 3465b717261cf5..66ce960462c63d 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -16,6 +16,7 @@
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
+; CHECK-NEXT: Module summary info
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
diff --git a/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll b/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll
new file mode 100644
index 00000000000000..c425eda56f5d5b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll
@@ -0,0 +1,40 @@
+; This test verifies the stable hash values for different global variables
+; that have distinct names.
+; We generate two different cgdata files from nearly identical outline instances,
+; with the only difference being the last call target globals, @g vs @h.
+
+; RUN: split-file %s %t
+
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/local-g.ll -o %t/local-g.o
+; RUN: llvm-cgdata --merge %t/local-g.o -o %t/local-g.cgdata
+; RUN: llvm-cgdata --convert %t/local-g.cgdata -o %t/local-g.cgtext
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/local-h.ll -o %t/local-h.o
+; RUN: llvm-cgdata --merge %t/local-h.o -o %t/local-h.cgdata
+; RUN: llvm-cgdata --convert %t/local-h.cgdata -o %t/local-h.cgtext
+
+; We compare the trees which are only different at the terminal node's hash value.
+; Here we simply count the different lines that have `Hash` string.
+; RUN: not diff %t/local-g.cgtext %t/local-h.cgtext 2>&1 | grep Hash | wc -l | FileCheck %s
+; CHECK: 2
+
+;--- local-g.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-h.ll
+declare i32 @h(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @h(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @h(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-outlined-name.ll b/llvm/test/CodeGen/AArch64/cgdata-outlined-name.ll
new file mode 100644
index 00000000000000..69f1ecd6515e7e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-outlined-name.ll
@@ -0,0 +1,41 @@
+; This test verifies the globally outlined function name has the content hash.
+
+; RUN: split-file %s %t
+
+; Check if the outlined function name has the content hash depending on the flag.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -append-content-hash-outlined-name=false -filetype=obj %t/local-two.ll -o %t_write_base
+; RUN: llvm-objdump -d %t_write_base | FileCheck %s --check-prefix=BASE
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -append-content-hash-outlined-name=true -filetype=obj %t/local-two.ll -o %t_write_suffix
+; RUN: llvm-objdump -d %t_write_suffix | FileCheck %s --check-prefix=SUFFIX
+; BASE-NOT: _OUTLINED_FUNCTION_{{.*}}.content.{{[0-9]+}}
+; SUFFIX: _OUTLINED_FUNCTION_{{.*}}.content.{{[0-9]+}}
+
+; Generate the cgdata file from each case and show they are identical.
+; RUN: llvm-cgdata --merge %t_write_base -o %t_cgdata_base
+; RUN: llvm-cgdata --merge %t_write_suffix -o %t_cgdata_suffix
+; RUN: diff %t_cgdata_base %t_cgdata_suffix
+
+; Read the cgdata in the machine outliner for optimistically outlining in local-one.ll.
+; Check if the outlined function has the content hash depending on the flag.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata_base -append-content-hash-outlined-name=false -filetype=obj %t/local-one.ll -o %t_read_base
+; RUN: llvm-objdump -d %t_read_base | FileCheck %s --check-prefix=BASE
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata_suffix -append-content-hash-outlined-name=true -filetype=obj %t/local-one.ll -o %t_read_suffix
+; RUN: llvm-objdump -d %t_read_suffix | FileCheck %s --check-prefix=SUFFIX
+
+;--- local-two.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-one.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll
new file mode 100644
index 00000000000000..6e027308c17068
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll
@@ -0,0 +1,57 @@
+; This test demonstrates how identical instruction sequences are handled during global outlining.
+; Currently, we do not attempt to share an outlined function for identical sequences.
+; Instead, each instruction sequence that matches against the global outlined hash tree
+; is outlined into its own unique function.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from a local outline instance present in local-two.ll.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write
+; RUN: llvm-cgdata --merge %t_write -o %t_cgdata
+; RUN: llvm-cgdata --show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we read the cgdata for local-two-another.ll and proceed to optimistically outline
+; each instruction sequence that matches against the global outlined hash tree.
+; Since each matching sequence is considered a candidate, we expect to generate two
+; unique outlined functions. These functions, although unique, will be identical in code,
+; and thus, will be folded by the linker.
+
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-two-another.ll -o %t_read
+; RUN: llvm-objdump -d %t_read | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+;--- local-two.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-two-another.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 40, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll
new file mode 100644
index 00000000000000..f1a5d1a0ccc7f0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll
@@ -0,0 +1,96 @@
+; This test is similar to cgdata-read-double-outline.ll, but it is executed with LTO (Link Time Optimization).
+; It demonstrates how identical instruction sequences are handled during global outlining.
+; Currently, we do not attempt to reuse an outlined function for identical sequences.
+; Instead, each instruction sequence that appears in the global outlined hash tree
+; is outlined into its own unique function.
+
+; RUN: split-file %s %t
+
+; We first create the cgdata file from a local outline instance in local-two.ll
+; RUN: opt -module-summary %t/local-two.ll -o %t/write.bc
+; RUN: llvm-lto2 run %t/write.bc -o %t/write \
+; RUN:  -r %t/write.bc,_f1,px -r %t/write.bc,_f2,px -r %t/write.bc,_g,p \
+; RUN:  -codegen-data-generate=true
+; RUN: llvm-cgdata --merge %t/write.1 -o %t_cgdata
+; RUN: llvm-cgdata --show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we execute either ThinLTO or LTO by reading the cgdata for local-two-another.ll.
+; With ThinLTO, similar to the no-LTO scenario shown in cgdata-read-double-outline.ll,
+; it optimistically outlines each instruction sequence that matches against
+; the global outlined hash tree. Since each matching sequence is considered a candidate,
+; we expect to generate two unique outlined functions that will be folded
+; by the linker at a later stage.
+; However, with LTO, we do not utilize the cgdata, but instead fall back to the default
+; outliner mode. This results in a single outlined function that is
+; shared across two call-sites.
+
+; Run ThinLTO
+; RUN: opt -module-summary %t/local-two-another.ll -o %t/thinlto.bc
+; RUN: llvm-lto2 run %t/thinlto.bc -o %t/thinlto \
+; RUN:  -r %t/thinlto.bc,_f3,px -r %t/thinlto.bc,_f4,px -r %t/thinlto.bc,_g,p \
+; RUN:  -codegen-data-use-path=%t_cgdata
+; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+; Run ThinLTO while disabling the global outliner.
+; We have a single outlined case with the default outliner.
+; RUN: llvm-lto2 run %t/thinlto.bc -o %t/thinlto-disable \
+; RUN:  -r %t/thinlto.bc,_f3,px -r %t/thinlto.bc,_f4,px -r %t/thinlto.bc,_g,p \
+; RUN:  -enable-machine-outliner \
+; RUN:  -codegen-data-use-path=%t_cgdata \
+; RUN:  -disable-global-outlining
+; RUN: llvm-objdump -d %t/thinlto-disable.1 | FileCheck %s --check-prefix=DISABLE
+
+; DISABLE: _OUTLINED_FUNCTION_{{.*}}:
+; DISABLE-NEXT:  mov
+; DISABLE-NEXT:  mov
+; DISABLE-NEXT:  b
+; DISABLE-NOT: _OUTLINED_FUNCTION_{{.*}}:
+
+; Run LTO, which effectively disables the global outliner.
+; RUN: opt %t/local-two-another.ll -o %t/lto.bc
+; RUN: llvm-lto2 run %t/lto.bc -o %t/lto \
+; RUN:  -r %t/lto.bc,_f3,px -r %t/lto.bc,_f4,px -r %t/lto.bc,_g,p \
+; RUN:  -enable-machine-outliner \
+; RUN:  -codegen-data-use-path=%t_cgdata
+; RUN: llvm-objdump -d %t/lto.0 | FileCheck %s --check-prefix=DISABLE
+
+;--- local-two.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-two-another.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 40, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll b/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll
new file mode 100644
index 00000000000000..affeea8c71acd3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll
@@ -0,0 +1,68 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; using codegen data that has been read from a previous codegen run.
+; When multiple matches occur, we prioritize the candidates using the global frequency.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from local outline instances present in write1.ll and write2.ll
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/write1.ll -o %t_write1
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/write2.ll -o %t_write2
+; RUN: llvm-cgdata --merge %t_write1 %t_write2 -o %t_cgdata
+; RUN: llvm-cgdata --show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 8
+; SHOW-NEXT:  Terminal Node Count: 2
+; SHOW-NEXT:  Depth: 4
+
+; Now, we read the cgdata in the machine outliner, enabling us to optimistically
+; outline a singleton instance in read.ll that matches against the cgdata.
+; There are two matches -- (1) (mov #1, mov #2, mov #3, b) and (2) (mov #2, mov #3, b).
+; Even though sequence (1) is longer than sequence (2), the latter is outlined because it occurs more frequently in the outlined hash tree.
+
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata -filetype=obj %t/read.ll -o %t_read
+; RUN: llvm-objdump -d %t_read | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+;--- write1.ll
+; The sequence (mov #2, mov #3, b) is repeated 4 times.
+declare i32 @g(i32, i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 50, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 60, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 70, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 40, i32 80, i32 2, i32 3);
+  ret i32 %1
+}
+
+;--- write2.ll
+; The sequence (mov #1, mov #2, mov #3, b) is repeated 2 times.
+declare i32 @g(i32, i32, i32, i32)
+define i32 @f6() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f7() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2, i32 3);
+  ret i32 %1
+}
+
+;--- read.ll
+declare i32 @g(i32, i32, i32, i32) ; four parameters, as passed by @f3
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2, i32 3);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll
new file mode 100644
index 00000000000000..7725648a6bc3d5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll
@@ -0,0 +1,42 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; using codegen data that has been read from a previous codegen run.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from a local outline instance present in local-two.ll.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write
+; RUN: llvm-cgdata --merge %t_write -o %t_cgdata
+; RUN: llvm-cgdata --show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we read the cgdata in the machine outliner, enabling us to optimistically
+; outline a singleton instance in local-one.ll that matches against the cgdata.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-one.ll -o %t_read
+; RUN: llvm-objdump -d %t_read | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+;--- local-two.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-one.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll
new file mode 100644
index 00000000000000..09ad499190ee37
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll
@@ -0,0 +1,51 @@
+; This test verifies whether an outlined function is encoded into the __llvm_outline section
+; when the -codegen-data-generate flag is used.
+
+; Verify whether an outlined function is always created, but only encoded into the section when the flag is used.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %s -o %t_save
+; RUN: llvm-objdump -d %t_save | FileCheck %s
+; RUN: llvm-objdump -h %t_save | FileCheck %s --check-prefix=SECTNAME
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=false -filetype=obj %s -o %t_nosave
+; RUN: llvm-objdump -d  %t_nosave | FileCheck %s
+; RUN: llvm-objdump -h %t_nosave | FileCheck %s --check-prefix=NOSECTNAME
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+; SECTNAME: __llvm_outline
+; NOSECTNAME-NOT: __llvm_outline
+
+; Verify the content of cgdata after it has been processed with llvm-cgdata.
+; RUN: llvm-cgdata --merge %t_save -o %t_cgdata
+; RUN: llvm-cgdata --convert %t_cgdata | FileCheck %s --check-prefix=TREE
+
+; TREE: :outlined_hash_tree
+; TREE: ---
+; TREE-NEXT: 0:
+; TREE-NEXT:   Hash:            0x0
+; TREE-NEXT:   Terminals:       0
+; TREE-NEXT:   SuccessorIds:    [ 1 ]
+; TREE-NEXT: 1:
+; TREE-NEXT:   Hash:            {{.}}
+; TREE-NEXT:   Terminals:       0
+; TREE-NEXT:   SuccessorIds:    [ 2 ]
+; TREE-NEXT: 2:
+; TREE-NEXT:   Hash:            {{.}}
+; TREE-NEXT:   Terminals:       0
+; TREE-NEXT:   SuccessorIds:    [ 3 ]
+; TREE-NEXT: 3:
+; TREE-NEXT:   Hash:            {{.}}
+; TREE-NEXT:   Terminals:       2
+; TREE-NEXT:   SuccessorIds:    [  ]
+; TREE-NEXT: ...
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 44c270fdc3c257..7749f0db0c54d3 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -20,6 +20,7 @@
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
+; CHECK-NEXT: Module summary info
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager



More information about the llvm-commits mailing list