[llvm] X86: Add prefetch insertion based on Propeller profile (PR #166324)

Rahman Lavaee via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 3 23:32:11 PST 2025


https://github.com/rlavaee created https://github.com/llvm/llvm-project/pull/166324

This commit introduces a new pass that inserts software prefetch instructions on X86 targets. The pass uses Propeller profiles to guide where prefetches are placed, with the goal of hiding memory access latency.
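
For context, the profile-reader changes in this patch extend the v1 basic block sections profile with two specifiers: 'h' lines pair a prefetch site in the current function (a basic block ID plus an offset within that block) with a target position written as target-function@bb-id@offset, and 't' lines mark prefetch-target positions in the current function, for which the AsmPrinter emits __llvm_prefetch_target_* labels. The fragment below is a hypothetical illustration only; the 'f'/'c' lines follow the existing v1 syntax, and every concrete name and number is made up.

v1
f bar
t 2@1
f foo
c 0 1 3
h 1@3 bar@2@1

Roughly, this marks a prefetch-target label in bar's block 2 after its first call site, and asks for a prefetch of that label's address at offset 3 in foo's block 1.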

The new file llvm/lib/Target/X86/PrefetchInsertion.cpp implements the pass. The commit also makes the modifications to related CodeGen and X86 target files that are needed to integrate it.
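
For reference, the sketch below shows the typical legacy-pass-manager skeleton such a target pass uses. Only the class name PrefetchInsertion, the initializePrefetchInsertionPass hook, and the getPrefetchHintsForFunction query are taken from this patch; the command-line name, analysis requirements, and insertion logic here are illustrative assumptions, and the actual PrefetchInsertion.cpp in the PR may differ.

// Sketch only -- not the patch's implementation. Names other than
// PrefetchInsertion, initializePrefetchInsertionPass and
// getPrefetchHintsForFunction are assumed for illustration.
#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"

using namespace llvm;

namespace {
class PrefetchInsertion : public MachineFunctionPass {
public:
  static char ID;
  PrefetchInsertion() : MachineFunctionPass(ID) {
    initializePrefetchInsertionPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The prefetch hints come from the Propeller profile reader.
    AU.addRequired<BasicBlockSectionsProfileReaderWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    auto &Reader = getAnalysis<BasicBlockSectionsProfileReaderWrapperPass>();
    auto Hints = Reader.getPrefetchHintsForFunction(MF.getName());
    // For each hint: find the site block by BBID, walk to the recorded
    // offset, and insert a prefetch of the target symbol's address there.
    return !Hints.empty();
  }
};
} // end anonymous namespace

char PrefetchInsertion::ID = 0;

// Assumed command-line name and description.
INITIALIZE_PASS(PrefetchInsertion, "x86-prefetch-insertion",
                "X86 Prefetch Insertion", false, false)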

It also fixes a build issue where PrefetchInsertion.cpp was missing from the X86 CMakeLists.txt.

From 3e6212c4466749e78dca65aff7f334f00f6b3a16 Mon Sep 17 00:00:00 2001
From: Rahman Lavaee <rahmanl at google.com>
Date: Tue, 4 Nov 2025 07:29:48 +0000
Subject: [PATCH] X86: Add prefetch insertion based on Propeller profile

This commit introduces a new pass for prefetch insertion on X86 targets.
The pass utilizes Propeller profiles to guide prefetch placement,
optimizing memory access patterns.

The new file llvm/lib/Target/X86/PrefetchInsertion.cpp implements this
functionality. This commit also includes necessary modifications to
related CodeGen and X86 target files to integrate the new pass.

A build issue where PrefetchInsertion.cpp was not included in the
CMakeLists.txt was also resolved.
---
 .../CodeGen/BasicBlockSectionsProfileReader.h |  49 +++-
 llvm/include/llvm/CodeGen/MachineBasicBlock.h | 170 ++++++-------
 llvm/include/llvm/CodeGen/MachineInstr.h      |  90 +++----
 llvm/include/llvm/InitializePasses.h          |   1 +
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    | 176 +++++++++-----
 llvm/lib/CodeGen/BasicBlockSections.cpp       |   6 +-
 .../BasicBlockSectionsProfileReader.cpp       |  86 ++++++-
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |  66 +++---
 llvm/lib/CodeGen/MachineBasicBlock.cpp        |  88 ++++---
 llvm/lib/Target/X86/CMakeLists.txt            | 223 +++++++++---------
 llvm/lib/Target/X86/PrefetchInsertion.cpp     | 209 ++++++++++++++++
 llvm/lib/Target/X86/X86.h                     |  13 +-
 llvm/lib/Target/X86/X86TargetMachine.cpp      |  16 +-
 13 files changed, 794 insertions(+), 399 deletions(-)
 create mode 100644 llvm/lib/Target/X86/PrefetchInsertion.cpp

diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
index 48650a6df22ff..b288374a38226 100644
--- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
+++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
@@ -42,6 +42,17 @@ struct BBClusterInfo {
   unsigned PositionInCluster;
 };
 
+struct BBPosition {
+  UniqueBBID BBID;
+  unsigned BBOffset;
+};
+
+struct PrefetchHint {
+  BBPosition SitePosition;
+  StringRef TargetFunctionName;
+  BBPosition TargetPosition;
+};
+
 // This represents the raw input profile for one function.
 struct FunctionPathAndClusterInfo {
   // BB Cluster information specified by `UniqueBBID`s.
@@ -50,19 +61,42 @@ struct FunctionPathAndClusterInfo {
   // the edge a -> b (a is not cloned). The index of the path in this vector
   // determines the `UniqueBBID::CloneID` of the cloned blocks in that path.
   SmallVector<SmallVector<unsigned>> ClonePaths;
+  SmallVector<PrefetchHint> PrefetchHints;
+  DenseSet<BBPosition> PrefetchTargets;
   // Node counts for each basic block.
   DenseMap<UniqueBBID, uint64_t> NodeCounts;
-  // Edge counts for each edge, stored as a nested map.
+  // Edge counts for each edge.
   DenseMap<UniqueBBID, DenseMap<UniqueBBID, uint64_t>> EdgeCounts;
 };
 
+// Provides DenseMapInfo for BBPosition.
+template <> struct DenseMapInfo<BBPosition> {
+  static inline BBPosition getEmptyKey() {
+    return {DenseMapInfo<UniqueBBID>::getEmptyKey(),
+            DenseMapInfo<unsigned>::getEmptyKey()};
+  }
+  static inline BBPosition getTombstoneKey() {
+    return BBPosition{DenseMapInfo<UniqueBBID>::getTombstoneKey(),
+                      DenseMapInfo<unsigned>::getTombstoneKey()};
+  }
+  static unsigned getHashValue(const BBPosition &Val) {
+    std::pair<unsigned, unsigned> PairVal = std::make_pair(
+        DenseMapInfo<UniqueBBID>::getHashValue(Val.BBID), Val.BBOffset);
+    return DenseMapInfo<std::pair<unsigned, unsigned>>::getHashValue(PairVal);
+  }
+  static bool isEqual(const BBPosition &LHS, const BBPosition &RHS) {
+    return DenseMapInfo<UniqueBBID>::isEqual(LHS.BBID, RHS.BBID) &&
+           DenseMapInfo<unsigned>::isEqual(LHS.BBOffset, RHS.BBOffset);
+  }
+};
+
 class BasicBlockSectionsProfileReader {
 public:
   friend class BasicBlockSectionsProfileReaderWrapperPass;
   BasicBlockSectionsProfileReader(const MemoryBuffer *Buf)
-      : MBuf(Buf), LineIt(*Buf, /*SkipBlanks=*/true, /*CommentMarker=*/'#'){};
+      : MBuf(Buf), LineIt(*Buf, /*SkipBlanks=*/true, /*CommentMarker=*/'#') {};
 
-  BasicBlockSectionsProfileReader(){};
+  BasicBlockSectionsProfileReader() {};
 
   // Returns true if basic block sections profile exist for function \p
   // FuncName.
@@ -86,6 +120,11 @@ class BasicBlockSectionsProfileReader {
   uint64_t getEdgeCount(StringRef FuncName, const UniqueBBID &SrcBBID,
                         const UniqueBBID &SinkBBID) const;
 
+  SmallVector<PrefetchHint>
+  getPrefetchHintsForFunction(StringRef FuncName) const;
+
+  DenseSet<BBPosition> getPrefetchTargetsForFunction(StringRef FuncName) const;
+
 private:
   StringRef getAliasName(StringRef FuncName) const {
     auto R = FuncAliasMap.find(FuncName);
@@ -194,6 +233,10 @@ class BasicBlockSectionsProfileReaderWrapperPass : public ImmutablePass {
 
   uint64_t getEdgeCount(StringRef FuncName, const UniqueBBID &SrcBBID,
                         const UniqueBBID &DestBBID) const;
+  SmallVector<PrefetchHint>
+  getPrefetchHintsForFunction(StringRef FuncName) const;
+
+  DenseSet<BBPosition> getPrefetchTargetsForFunction(StringRef FuncName) const;
 
   // Initializes the FunctionNameToDIFilename map for the current module and
   // then reads the profile for the matching functions.
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 71739278cf513..deff97416df23 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -100,6 +100,12 @@ template <> struct DenseMapInfo<MBBSectionID> {
   }
 };
 
+struct PrefetchTarget {
+  StringRef TargetFunction;
+  UniqueBBID TargetBBID;
+  unsigned TargetBBOffset;
+};
+
 template <> struct ilist_traits<MachineInstr> {
 private:
   friend class MachineBasicBlock; // Set by the owning MachineBasicBlock.
@@ -213,6 +219,8 @@ class MachineBasicBlock
   /// basic block sections and basic block labels.
   std::optional<UniqueBBID> BBID;
 
+  SmallVector<unsigned> PrefetchTargets;
+
   /// With basic block sections, this stores the Section ID of the basic block.
   MBBSectionID SectionID{0};
 
@@ -229,6 +237,8 @@ class MachineBasicBlock
   /// is only computed once and is cached.
   mutable MCSymbol *CachedMCSymbol = nullptr;
 
+  mutable SmallVector<MCSymbol *, 4> CallInstSymbols;
+
   /// Cached MCSymbol for this block (used if IsEHContTarget).
   mutable MCSymbol *CachedEHContMCSymbol = nullptr;
 
@@ -254,9 +264,7 @@ class MachineBasicBlock
 
   /// Remove the reference to the underlying IR BasicBlock. This is for
   /// reduction tools and should generally not be used.
-  void clearBasicBlock() {
-    BB = nullptr;
-  }
+  void clearBasicBlock() { BB = nullptr; }
 
   /// Check if there is a name of corresponding LLVM basic block.
   LLVM_ABI bool hasName() const;
@@ -348,24 +356,24 @@ class MachineBasicBlock
   LLVM_ABI bool sizeWithoutDebugLargerThan(unsigned Limit) const;
   bool empty() const { return Insts.empty(); }
 
-  MachineInstr       &instr_front()       { return Insts.front(); }
-  MachineInstr       &instr_back()        { return Insts.back();  }
+  MachineInstr &instr_front() { return Insts.front(); }
+  MachineInstr &instr_back() { return Insts.back(); }
   const MachineInstr &instr_front() const { return Insts.front(); }
-  const MachineInstr &instr_back()  const { return Insts.back();  }
-
-  MachineInstr       &front()             { return Insts.front(); }
-  MachineInstr       &back()              { return *--end();      }
-  const MachineInstr &front()       const { return Insts.front(); }
-  const MachineInstr &back()        const { return *--end();      }
-
-  instr_iterator                instr_begin()       { return Insts.begin();  }
-  const_instr_iterator          instr_begin() const { return Insts.begin();  }
-  instr_iterator                  instr_end()       { return Insts.end();    }
-  const_instr_iterator            instr_end() const { return Insts.end();    }
-  reverse_instr_iterator       instr_rbegin()       { return Insts.rbegin(); }
+  const MachineInstr &instr_back() const { return Insts.back(); }
+
+  MachineInstr &front() { return Insts.front(); }
+  MachineInstr &back() { return *--end(); }
+  const MachineInstr &front() const { return Insts.front(); }
+  const MachineInstr &back() const { return *--end(); }
+
+  instr_iterator instr_begin() { return Insts.begin(); }
+  const_instr_iterator instr_begin() const { return Insts.begin(); }
+  instr_iterator instr_end() { return Insts.end(); }
+  const_instr_iterator instr_end() const { return Insts.end(); }
+  reverse_instr_iterator instr_rbegin() { return Insts.rbegin(); }
   const_reverse_instr_iterator instr_rbegin() const { return Insts.rbegin(); }
-  reverse_instr_iterator       instr_rend  ()       { return Insts.rend();   }
-  const_reverse_instr_iterator instr_rend  () const { return Insts.rend();   }
+  reverse_instr_iterator instr_rend() { return Insts.rend(); }
+  const_reverse_instr_iterator instr_rend() const { return Insts.rend(); }
 
   using instr_range = iterator_range<instr_iterator>;
   using const_instr_range = iterator_range<const_instr_iterator>;
@@ -374,10 +382,10 @@ class MachineBasicBlock
     return const_instr_range(instr_begin(), instr_end());
   }
 
-  iterator                begin()       { return instr_begin();  }
-  const_iterator          begin() const { return instr_begin();  }
-  iterator                end  ()       { return instr_end();    }
-  const_iterator          end  () const { return instr_end();    }
+  iterator begin() { return instr_begin(); }
+  const_iterator begin() const { return instr_begin(); }
+  iterator end() { return instr_end(); }
+  const_iterator end() const { return instr_end(); }
   reverse_iterator rbegin() {
     return reverse_iterator::getAtBundleBegin(instr_rbegin());
   }
@@ -424,38 +432,30 @@ class MachineBasicBlock
       SmallVectorImpl<MachineBasicBlock *>::reverse_iterator;
   using const_succ_reverse_iterator =
       SmallVectorImpl<MachineBasicBlock *>::const_reverse_iterator;
-  pred_iterator        pred_begin()       { return Predecessors.begin(); }
-  const_pred_iterator  pred_begin() const { return Predecessors.begin(); }
-  pred_iterator        pred_end()         { return Predecessors.end();   }
-  const_pred_iterator  pred_end()   const { return Predecessors.end();   }
-  pred_reverse_iterator        pred_rbegin()
-                                          { return Predecessors.rbegin();}
-  const_pred_reverse_iterator  pred_rbegin() const
-                                          { return Predecessors.rbegin();}
-  pred_reverse_iterator        pred_rend()
-                                          { return Predecessors.rend();  }
-  const_pred_reverse_iterator  pred_rend()   const
-                                          { return Predecessors.rend();  }
-  unsigned             pred_size()  const {
-    return (unsigned)Predecessors.size();
-  }
-  bool                 pred_empty() const { return Predecessors.empty(); }
-  succ_iterator        succ_begin()       { return Successors.begin();   }
-  const_succ_iterator  succ_begin() const { return Successors.begin();   }
-  succ_iterator        succ_end()         { return Successors.end();     }
-  const_succ_iterator  succ_end()   const { return Successors.end();     }
-  succ_reverse_iterator        succ_rbegin()
-                                          { return Successors.rbegin();  }
-  const_succ_reverse_iterator  succ_rbegin() const
-                                          { return Successors.rbegin();  }
-  succ_reverse_iterator        succ_rend()
-                                          { return Successors.rend();    }
-  const_succ_reverse_iterator  succ_rend()   const
-                                          { return Successors.rend();    }
-  unsigned             succ_size()  const {
-    return (unsigned)Successors.size();
-  }
-  bool                 succ_empty() const { return Successors.empty();   }
+  pred_iterator pred_begin() { return Predecessors.begin(); }
+  const_pred_iterator pred_begin() const { return Predecessors.begin(); }
+  pred_iterator pred_end() { return Predecessors.end(); }
+  const_pred_iterator pred_end() const { return Predecessors.end(); }
+  pred_reverse_iterator pred_rbegin() { return Predecessors.rbegin(); }
+  const_pred_reverse_iterator pred_rbegin() const {
+    return Predecessors.rbegin();
+  }
+  pred_reverse_iterator pred_rend() { return Predecessors.rend(); }
+  const_pred_reverse_iterator pred_rend() const { return Predecessors.rend(); }
+  unsigned pred_size() const { return (unsigned)Predecessors.size(); }
+  bool pred_empty() const { return Predecessors.empty(); }
+  succ_iterator succ_begin() { return Successors.begin(); }
+  const_succ_iterator succ_begin() const { return Successors.begin(); }
+  succ_iterator succ_end() { return Successors.end(); }
+  const_succ_iterator succ_end() const { return Successors.end(); }
+  succ_reverse_iterator succ_rbegin() { return Successors.rbegin(); }
+  const_succ_reverse_iterator succ_rbegin() const {
+    return Successors.rbegin();
+  }
+  succ_reverse_iterator succ_rend() { return Successors.rend(); }
+  const_succ_reverse_iterator succ_rend() const { return Successors.rend(); }
+  unsigned succ_size() const { return (unsigned)Successors.size(); }
+  bool succ_empty() const { return Successors.empty(); }
 
   inline iterator_range<pred_iterator> predecessors() {
     return make_range(pred_begin(), pred_end());
@@ -528,8 +528,8 @@ class MachineBasicBlock
   }
 
   LLVM_ABI livein_iterator livein_begin() const;
-  livein_iterator livein_end()   const { return LiveIns.end(); }
-  bool            livein_empty() const { return LiveIns.empty(); }
+  livein_iterator livein_end() const { return LiveIns.end(); }
+  bool livein_empty() const { return LiveIns.empty(); }
   iterator_range<livein_iterator> liveins() const {
     return make_range(livein_begin(), livein_end());
   }
@@ -581,13 +581,9 @@ class MachineBasicBlock
       return Tmp;
     }
 
-    reference operator*() const {
-      return *LiveRegI;
-    }
+    reference operator*() const { return *LiveRegI; }
 
-    pointer operator->() const {
-      return &*LiveRegI;
-    }
+    pointer operator->() const { return &*LiveRegI; }
 
     bool operator==(const liveout_iterator &RHS) const {
       if (BlockI != BlockEnd)
@@ -598,6 +594,7 @@ class MachineBasicBlock
     bool operator!=(const liveout_iterator &RHS) const {
       return !(*this == RHS);
     }
+
   private:
     bool advanceToValidPosition() {
       if (LiveRegI != (*BlockI)->livein_end())
@@ -710,6 +707,14 @@ class MachineBasicBlock
 
   std::optional<UniqueBBID> getBBID() const { return BBID; }
 
+  const SmallVector<unsigned> &getPrefetchTargets() const {
+    return PrefetchTargets;
+  }
+
+  void setPrefetchTargets(const SmallVector<unsigned> &V) {
+    PrefetchTargets = V;
+  }
+
   /// Returns the section ID of this basic block.
   MBBSectionID getSectionID() const { return SectionID; }
 
@@ -978,9 +983,7 @@ class MachineBasicBlock
 
   /// Convenience function that returns true if the block ends in a return
   /// instruction.
-  bool isReturnBlock() const {
-    return !empty() && back().isReturn();
-  }
+  bool isReturnBlock() const { return !empty() && back().isReturn(); }
 
   /// Convenience function that returns true if the bock ends in a EH scope
   /// return instruction.
@@ -1057,8 +1060,7 @@ class MachineBasicBlock
   LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M);
 
   /// Insert a range of instructions into the instruction list before I.
-  template<typename IT>
-  void insert(iterator I, IT S, IT E) {
+  template <typename IT> void insert(iterator I, IT S, IT E) {
     assert((I == end() || I->getParent() == this) &&
            "iterator points outside of basic block");
     Insts.insert(I.getInstrIterator(), S, E);
@@ -1116,17 +1118,13 @@ class MachineBasicBlock
   /// Remove an instruction or bundle from the instruction list and delete it.
   ///
   /// If I points to a bundle of instructions, they are all erased.
-  iterator erase(iterator I) {
-    return erase(I, std::next(I));
-  }
+  iterator erase(iterator I) { return erase(I, std::next(I)); }
 
   /// Remove an instruction from the instruction list and delete it.
   ///
   /// If I is the head of a bundle of instructions, the whole bundle will be
   /// erased.
-  iterator erase(MachineInstr *I) {
-    return erase(iterator(I));
-  }
+  iterator erase(MachineInstr *I) { return erase(iterator(I)); }
 
   /// Remove the unbundled instruction from the instruction list without
   /// deleting it.
@@ -1145,9 +1143,7 @@ class MachineBasicBlock
   /// bundle will still be bundled after removing the single instruction.
   LLVM_ABI MachineInstr *remove_instr(MachineInstr *I);
 
-  void clear() {
-    Insts.clear();
-  }
+  void clear() { Insts.clear(); }
 
   /// Take an instruction from MBB 'Other' at the position From, and insert it
   /// into this MBB right before 'Where'.
@@ -1164,8 +1160,8 @@ class MachineBasicBlock
   ///
   /// The instruction at 'Where' must not be included in the range of
   /// instructions to move.
-  void splice(iterator Where, MachineBasicBlock *Other,
-              iterator From, iterator To) {
+  void splice(iterator Where, MachineBasicBlock *Other, iterator From,
+              iterator To) {
     Insts.splice(Where.getInstrIterator(), Other->Insts,
                  From.getInstrIterator(), To.getInstrIterator());
   }
@@ -1251,7 +1247,7 @@ class MachineBasicBlock
                       bool IsStandalone = true) const;
 
   enum PrintNameFlag {
-    PrintNameIr = (1 << 0), ///< Add IR name where available
+    PrintNameIr = (1 << 0),         ///< Add IR name where available
     PrintNameAttributes = (1 << 1), ///< Print attributes
   };
 
@@ -1275,6 +1271,12 @@ class MachineBasicBlock
   /// Return the MCSymbol for this basic block.
   LLVM_ABI MCSymbol *getSymbol() const;
 
+  MCSymbol *getCallInstSymbol(unsigned CallInstNumber) const;
+
+  const SmallVector<MCSymbol *, 4> &getCallInstSymbols() const {
+    return CallInstSymbols;
+  }
+
   /// Return the Windows EH Continuation Symbol for this basic block.
   LLVM_ABI MCSymbol *getEHContSymbol() const;
 
@@ -1282,9 +1284,7 @@ class MachineBasicBlock
     return IrrLoopHeaderWeight;
   }
 
-  void setIrrLoopHeaderWeight(uint64_t Weight) {
-    IrrLoopHeaderWeight = Weight;
-  }
+  void setIrrLoopHeaderWeight(uint64_t Weight) { IrrLoopHeaderWeight = Weight; }
 
   /// Return probability of the edge from this block to MBB. This method should
   /// NOT be called directly, but by using getEdgeProbability method from
@@ -1393,7 +1393,7 @@ static_assert(GraphHasNodeNumbers<const MachineBasicBlock *>,
 // to be when traversing the predecessor edges of a MBB
 // instead of the successor edges.
 //
-template <> struct GraphTraits<Inverse<MachineBasicBlock*>> {
+template <> struct GraphTraits<Inverse<MachineBasicBlock *>> {
   using NodeRef = MachineBasicBlock *;
   using ChildIteratorType = MachineBasicBlock::pred_iterator;
 
@@ -1413,7 +1413,7 @@ template <> struct GraphTraits<Inverse<MachineBasicBlock*>> {
 static_assert(GraphHasNodeNumbers<Inverse<MachineBasicBlock *>>,
               "GraphTraits getNumber() not detected");
 
-template <> struct GraphTraits<Inverse<const MachineBasicBlock*>> {
+template <> struct GraphTraits<Inverse<const MachineBasicBlock *>> {
   using NodeRef = const MachineBasicBlock *;
   using ChildIteratorType = MachineBasicBlock::const_pred_iterator;
 
diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 4fcb7f36e0238..ab9fe82bc7917 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -78,9 +78,9 @@ class MachineInstr
   /// otherwise easily derivable from the IR text.
   ///
   enum CommentFlag {
-    ReloadReuse = 0x1,    // higher bits are reserved for target dep comments.
+    ReloadReuse = 0x1, // higher bits are reserved for target dep comments.
     NoSchedComment = 0x2,
-    TAsmComments = 0x4    // Target Asm comments should start from this value.
+    TAsmComments = 0x4 // Target Asm comments should start from this value.
   };
 
   enum MIFlag {
@@ -123,16 +123,17 @@ class MachineInstr
     NoUSWrap = 1 << 20,      // Instruction supports geps
                              // no unsigned signed wrap.
     SameSign = 1 << 21,      // Both operands have the same sign.
-    InBounds = 1 << 22       // Pointer arithmetic remains inbounds.
+    InBounds = 1 << 22,      // Pointer arithmetic remains inbounds.
                              // Implies NoUSWrap.
+    Prefetch = 1 << 23,      // Instruction is a prefetch.
   };
 
 private:
-  const MCInstrDesc *MCID;              // Instruction descriptor.
-  MachineBasicBlock *Parent = nullptr;  // Pointer to the owning basic block.
+  const MCInstrDesc *MCID;             // Instruction descriptor.
+  MachineBasicBlock *Parent = nullptr; // Pointer to the owning basic block.
 
   // Operands are allocated by an ArrayRecycler.
-  MachineOperand *Operands = nullptr;   // Pointer to the first operand.
+  MachineOperand *Operands = nullptr; // Pointer to the first operand.
 
 #define LLVM_MI_NUMOPERANDS_BITS 24
 #define LLVM_MI_FLAGS_BITS 24
@@ -144,7 +145,7 @@ class MachineInstr
   // OperandCapacity has uint8_t size, so it should be next to NumOperands
   // to properly pack.
   using OperandCapacity = ArrayRecycler<MachineOperand>::Capacity;
-  OperandCapacity CapOperands;          // Capacity of the Operands array.
+  OperandCapacity CapOperands; // Capacity of the Operands array.
 
   /// Various bits of additional information about the machine instruction.
   uint32_t Flags : LLVM_MI_FLAGS_BITS;
@@ -226,9 +227,8 @@ class MachineInstr
     }
 
     MDNode *getPCSections() const {
-      return HasPCSections
-                 ? getTrailingObjects<MDNode *>()[HasHeapAllocMarker]
-                 : nullptr;
+      return HasPCSections ? getTrailingObjects<MDNode *>()[HasHeapAllocMarker]
+                           : nullptr;
     }
 
     uint32_t getCFIType() const {
@@ -356,8 +356,8 @@ class MachineInstr
   // Use MachineFunction::DeleteMachineInstr() instead.
   ~MachineInstr() = delete;
 
-  const MachineBasicBlock* getParent() const { return Parent; }
-  MachineBasicBlock* getParent() { return Parent; }
+  const MachineBasicBlock *getParent() const { return Parent; }
+  MachineBasicBlock *getParent() { return Parent; }
 
   /// Move the instruction before \p MovePos.
   LLVM_ABI void moveBefore(MachineInstr *MovePos);
@@ -401,9 +401,7 @@ class MachineInstr
   }
 
   /// Return the MI flags bitvector.
-  uint32_t getFlags() const {
-    return Flags;
-  }
+  uint32_t getFlags() const { return Flags; }
 
   /// Return whether an MI flag is set.
   bool getFlag(MIFlag Flag) const {
@@ -475,15 +473,11 @@ class MachineInstr
   ///   ----------------
   /// The first instruction has the special opcode "BUNDLE". It's not "inside"
   /// a bundle, but the next three MIs are.
-  bool isInsideBundle() const {
-    return getFlag(BundledPred);
-  }
+  bool isInsideBundle() const { return getFlag(BundledPred); }
 
   /// Return true if this instruction part of a bundle. This is true
   /// if either itself or its following instruction is marked "InsideBundle".
-  bool isBundled() const {
-    return isBundledWithPred() || isBundledWithSucc();
-  }
+  bool isBundled() const { return isBundledWithPred() || isBundledWithSucc(); }
 
   /// Return true if this instruction is part of a bundle, and it is not the
   /// first instruction in the bundle.
@@ -882,9 +876,9 @@ class MachineInstr
   /// queries but they are bundle aware.
 
   enum QueryType {
-    IgnoreBundle,    // Ignore bundles
-    AnyInBundle,     // Return true if any instruction in bundle has property
-    AllInBundle      // Return true if all instructions in bundle have property
+    IgnoreBundle, // Ignore bundles
+    AnyInBundle,  // Return true if any instruction in bundle has property
+    AllInBundle   // Return true if all instructions in bundle have property
   };
 
   /// Return true if the instruction (or in the case of a bundle,
@@ -1010,8 +1004,8 @@ class MachineInstr
   /// values.   There are various methods in TargetInstrInfo that can be used to
   /// control and modify the predicate in this instruction.
   bool isPredicable(QueryType Type = AllInBundle) const {
-    // If it's a bundle than all bundled instructions must be predicable for this
-    // to return true.
+    // If it's a bundle than all bundled instructions must be predicable for
+    // this to return true.
     return hasProperty(MCID::Predicable, Type);
   }
 
@@ -1269,10 +1263,10 @@ class MachineInstr
   }
 
   enum MICheckType {
-    CheckDefs,      // Check all operands for equality
-    CheckKillDead,  // Check all operands including kill / dead markers
-    IgnoreDefs,     // Ignore all definitions
-    IgnoreVRegDefs  // Ignore virtual register definitions
+    CheckDefs,     // Check all operands for equality
+    CheckKillDead, // Check all operands including kill / dead markers
+    IgnoreDefs,    // Ignore all definitions
+    IgnoreVRegDefs // Ignore virtual register definitions
   };
 
   /// Return true if this instruction is identical to \p Other.
@@ -1399,7 +1393,9 @@ class MachineInstr
            getOpcode() == TargetOpcode::G_PHI;
   }
   bool isKill() const { return getOpcode() == TargetOpcode::KILL; }
-  bool isImplicitDef() const { return getOpcode()==TargetOpcode::IMPLICIT_DEF; }
+  bool isImplicitDef() const {
+    return getOpcode() == TargetOpcode::IMPLICIT_DEF;
+  }
   bool isInlineAsm() const {
     return getOpcode() == TargetOpcode::INLINEASM ||
            getOpcode() == TargetOpcode::INLINEASM_BR;
@@ -1424,13 +1420,9 @@ class MachineInstr
     return getOpcode() == TargetOpcode::REG_SEQUENCE;
   }
 
-  bool isBundle() const {
-    return getOpcode() == TargetOpcode::BUNDLE;
-  }
+  bool isBundle() const { return getOpcode() == TargetOpcode::BUNDLE; }
 
-  bool isCopy() const {
-    return getOpcode() == TargetOpcode::COPY;
-  }
+  bool isCopy() const { return getOpcode() == TargetOpcode::COPY; }
 
   bool isFullCopy() const {
     return isCopy() && !getOperand(0).getSubReg() && !getOperand(1).getSubReg();
@@ -1444,14 +1436,12 @@ class MachineInstr
 
   /// Return true if the instruction behaves like a copy.
   /// This does not include native copy instructions.
-  bool isCopyLike() const {
-    return isCopy() || isSubregToReg();
-  }
+  bool isCopyLike() const { return isCopy() || isSubregToReg(); }
 
   /// Return true is the instruction is an identity copy.
   bool isIdentityCopy() const {
     return isCopy() && getOperand(0).getReg() == getOperand(1).getReg() &&
-      getOperand(0).getSubReg() == getOperand(1).getSubReg();
+           getOperand(0).getSubReg() == getOperand(1).getSubReg();
   }
 
   /// Return true if this is a transient instruction that is either very likely
@@ -2067,12 +2057,12 @@ class MachineInstr
   /// Unlink all of the register operands in this instruction from their
   /// respective use lists.  This requires that the operands already be on their
   /// use lists.
-  void removeRegOperandsFromUseLists(MachineRegisterInfo&);
+  void removeRegOperandsFromUseLists(MachineRegisterInfo &);
 
   /// Add all of the register operands in this instruction from their
   /// respective use lists.  This requires that the operands not be on their
   /// use lists yet.
-  void addRegOperandsToUseLists(MachineRegisterInfo&);
+  void addRegOperandsToUseLists(MachineRegisterInfo &);
 
   /// Slow path for hasProperty when we're dealing with a bundle.
   LLVM_ABI bool hasPropertyInBundle(uint64_t Mask, QueryType Type) const;
@@ -2096,19 +2086,17 @@ class MachineInstr
 /// instruction rather than by pointer value.
 /// The hashing and equality testing functions ignore definitions so this is
 /// useful for CSE, etc.
-struct MachineInstrExpressionTrait : DenseMapInfo<MachineInstr*> {
-  static inline MachineInstr *getEmptyKey() {
-    return nullptr;
-  }
+struct MachineInstrExpressionTrait : DenseMapInfo<MachineInstr *> {
+  static inline MachineInstr *getEmptyKey() { return nullptr; }
 
   static inline MachineInstr *getTombstoneKey() {
-    return reinterpret_cast<MachineInstr*>(-1);
+    return reinterpret_cast<MachineInstr *>(-1);
   }
 
   LLVM_ABI static unsigned getHashValue(const MachineInstr *const &MI);
 
-  static bool isEqual(const MachineInstr* const &LHS,
-                      const MachineInstr* const &RHS) {
+  static bool isEqual(const MachineInstr *const &LHS,
+                      const MachineInstr *const &RHS) {
     if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
         LHS == getEmptyKey() || LHS == getTombstoneKey())
       return LHS == RHS;
@@ -2119,7 +2107,7 @@ struct MachineInstrExpressionTrait : DenseMapInfo<MachineInstr*> {
 //===----------------------------------------------------------------------===//
 // Debugging Support
 
-inline raw_ostream& operator<<(raw_ostream &OS, const MachineInstr &MI) {
+inline raw_ostream &operator<<(raw_ostream &OS, const MachineInstr &MI) {
   MI.print(OS);
   return OS;
 }
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 581b4ad161daa..1ae4b76adc92f 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -56,6 +56,7 @@ LLVM_ABI void initializeAssignmentTrackingAnalysisPass(PassRegistry &);
 LLVM_ABI void initializeAssumptionCacheTrackerPass(PassRegistry &);
 LLVM_ABI void initializeAtomicExpandLegacyPass(PassRegistry &);
 LLVM_ABI void initializeBasicBlockPathCloningPass(PassRegistry &);
+LLVM_ABI void initializePrefetchInsertionPass(PassRegistry &);
 LLVM_ABI void
 initializeBasicBlockSectionsProfileReaderWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeBasicBlockSectionsPass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 8aa488f0efd8f..363e9a61ef1b9 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -119,6 +119,7 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/SMLoc.h"
 #include "llvm/Support/VCSRevision.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
@@ -178,6 +179,11 @@ static cl::opt<bool> EmitJumpTableSizesSection(
     cl::desc("Emit a section containing jump table addresses and sizes"),
     cl::Hidden, cl::init(false));
 
+static cl::opt<bool> InsertNoopsForPrefetch(
+    "insert-noops-for-prefetch",
+    cl::desc("Whether to insert noops instead of prefetches."), cl::init(false),
+    cl::Hidden);
+
 // This isn't turned on by default, since several of the scheduling models are
 // not completely accurate, and we don't want to be misleading.
 static cl::opt<bool> PrintLatency(
@@ -199,9 +205,7 @@ class AddrLabelMapCallbackPtr final : CallbackVH {
   AddrLabelMapCallbackPtr() = default;
   AddrLabelMapCallbackPtr(Value *V) : CallbackVH(V) {}
 
-  void setPtr(BasicBlock *BB) {
-    ValueHandleBase::operator=(BB);
-  }
+  void setPtr(BasicBlock *BB) { ValueHandleBase::operator=(BB); }
 
   void setMap(AddrLabelMap *map) { Map = map; }
 
@@ -639,7 +643,8 @@ bool AsmPrinter::doInitialization(Module &M) {
     break;
   case ExceptionHandling::WinEH:
     switch (MAI->getWinEHEncodingType()) {
-    default: llvm_unreachable("unsupported unwinding information encoding");
+    default:
+      llvm_unreachable("unsupported unwinding information encoding");
     case WinEH::EncodingType::Invalid:
       break;
     case WinEH::EncodingType::X86:
@@ -697,7 +702,7 @@ void AsmPrinter::emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
     } else if (MAI->avoidWeakIfComdat() && GV->hasComdat()) {
       // .globl _foo
       OutStreamer->emitSymbolAttribute(GVSym, MCSA_Global);
-      //NOTE: linkonce is handled by the section the symbol was assigned to.
+      // NOTE: linkonce is handled by the section the symbol was assigned to.
     } else {
       // .weak _foo
       OutStreamer->emitSymbolAttribute(GVSym, MCSA_Weak);
@@ -734,7 +739,8 @@ MCSymbol *AsmPrinter::getSymbolPreferLocal(const GlobalValue &GV) const {
   // assembler would otherwise be conservative and assume a global default
   // visibility symbol can be interposable, even if the code generator already
   // assumed it.
-  if (TM.getTargetTriple().isOSBinFormatELF() && GV.canBenefitFromLocalAlias()) {
+  if (TM.getTargetTriple().isOSBinFormatELF() &&
+      GV.canBenefitFromLocalAlias()) {
     const Module &M = *GV.getParent();
     if (TM.getRelocationModel() != Reloc::Static &&
         M.getPIELevel() == PIELevel::Default && GV.isDSOLocal())
@@ -791,7 +797,7 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
     OutStreamer->emitSymbolAttribute(EmittedSym, MCSA_Memtag);
   }
 
-  if (!GV->hasInitializer())   // External globals require no extra code.
+  if (!GV->hasInitializer()) // External globals require no extra code.
     return;
 
   GVSym->redefineIfPossible();
@@ -817,7 +823,8 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
 
   // Handle common symbols
   if (GVKind.isCommon()) {
-    if (Size == 0) Size = 1;   // .comm Foo, 0 is undefined, avoid it.
+    if (Size == 0)
+      Size = 1; // .comm Foo, 0 is undefined, avoid it.
     // .comm _foo, 42, 4
     OutStreamer->emitCommonSymbol(GVSym, Size, Alignment);
     return;
@@ -887,8 +894,7 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
       emitAlignment(Alignment, GV);
       OutStreamer->emitLabel(MangSym);
 
-      emitGlobalConstant(GV->getDataLayout(),
-                         GV->getInitializer());
+      emitGlobalConstant(GV->getDataLayout(), GV->getInitializer());
     }
 
     OutStreamer->addBlankLine();
@@ -907,7 +913,7 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
     //   - pointer to mangled symbol above with initializer
     unsigned PtrSize = DL.getPointerTypeSize(GV->getType());
     OutStreamer->emitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"),
-                                PtrSize);
+                                 PtrSize);
     OutStreamer->emitIntValue(0, PtrSize);
     OutStreamer->emitSymbolValue(MangSym, PtrSize);
 
@@ -1063,7 +1069,7 @@ void AsmPrinter::emitFunctionHeader() {
   // If the function had address-taken blocks that got deleted, then we have
   // references to the dangling symbols.  Emit them at the start of the function
   // so that we don't get references to undefined symbols.
-  std::vector<MCSymbol*> DeadBlockSyms;
+  std::vector<MCSymbol *> DeadBlockSyms;
   takeDeletedSymbolsForFunction(&F, DeadBlockSyms);
   for (MCSymbol *DeadBlockSym : DeadBlockSyms) {
     OutStreamer->AddComment("Address taken block that was later removed");
@@ -1075,7 +1081,7 @@ void AsmPrinter::emitFunctionHeader() {
       MCSymbol *CurPos = OutContext.createTempSymbol();
       OutStreamer->emitLabel(CurPos);
       OutStreamer->emitAssignment(CurrentFnBegin,
-                                 MCSymbolRefExpr::create(CurPos, OutContext));
+                                  MCSymbolRefExpr::create(CurPos, OutContext));
     } else {
       OutStreamer->emitLabel(CurrentFnBegin);
     }
@@ -1394,7 +1400,7 @@ void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) {
 
   // Emit a symbol assignment.
   OutStreamer->emitAssignment(FrameAllocSym,
-                             MCConstantExpr::create(FrameOffset, OutContext));
+                              MCConstantExpr::create(FrameOffset, OutContext));
 }
 
 /// Returns the BB metadata to be emitted in the SHT_LLVM_BB_ADDR_MAP section
@@ -1983,10 +1989,38 @@ void AsmPrinter::emitFunctionBody() {
   FunctionCallGraphInfo FuncCGInfo;
   const auto &CallSitesInfoMap = MF->getCallSitesInfo();
   for (auto &MBB : *MF) {
+    int NextPrefetchTargetIndex = MBB.getPrefetchTargets().empty() ? -1 : 0;
     // Print a label for the basic block.
     emitBasicBlockStart(MBB);
     DenseMap<StringRef, unsigned> MnemonicCounts;
+    unsigned NumCallsInBlock = 0;
     for (auto &MI : MBB) {
+      if (NextPrefetchTargetIndex != -1 &&
+          NumCallsInBlock >=
+              MBB.getPrefetchTargets()[NextPrefetchTargetIndex]) {
+
+        MCSymbol *PrefetchTargetSymbol = OutContext.getOrCreateSymbol(
+            Twine("__llvm_prefetch_target_") + MF->getName() + Twine("_") +
+            utostr(MBB.getBBID()->BaseID) + Twine("_") +
+            utostr(MBB.getPrefetchTargets()[NextPrefetchTargetIndex]));
+        if (MF->getFunction().isWeakForLinker()) {
+          OutStreamer->emitSymbolAttribute(PrefetchTargetSymbol, MCSA_Weak);
+          errs() << "Emitting weak symbol: " << PrefetchTargetSymbol->getName()
+                 << "\n";
+        } else {
+          OutStreamer->emitSymbolAttribute(PrefetchTargetSymbol, MCSA_Global);
+          errs() << "Emitting global symbol: "
+                 << PrefetchTargetSymbol->getName() << "\n";
+        }
+        // OutStreamer->emitSymbolAttribute(PrefetchTargetSymbol, MCSA_Extern);
+        // errs() << "Emitting symbol: " << PrefetchTargetSymbol->getName() <<
+        // "\n";
+        OutStreamer->emitLabel(PrefetchTargetSymbol);
+        ++NextPrefetchTargetIndex;
+        if (NextPrefetchTargetIndex >=
+            static_cast<int>(MBB.getPrefetchTargets().size()))
+          NextPrefetchTargetIndex = -1;
+      }
       // Print the assembly for the instruction.
       if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&
           !MI.isDebugInstr()) {
@@ -2061,10 +2095,12 @@ void AsmPrinter::emitFunctionBody() {
         }
         break;
       case TargetOpcode::IMPLICIT_DEF:
-        if (isVerbose()) emitImplicitDef(&MI);
+        if (isVerbose())
+          emitImplicitDef(&MI);
         break;
       case TargetOpcode::KILL:
-        if (isVerbose()) emitKill(&MI, *this);
+        if (isVerbose())
+          emitKill(&MI, *this);
         break;
       case TargetOpcode::FAKE_USE:
         if (isVerbose())
@@ -2089,7 +2125,11 @@ void AsmPrinter::emitFunctionBody() {
         // actual initialization is needed.
         break;
       default:
-        emitInstruction(&MI);
+        if (MI.getFlag(MachineInstr::Prefetch) && InsertNoopsForPrefetch) {
+          OutStreamer->emitNops(7, 7, SMLoc(), getSubtargetInfo());
+        } else {
+          emitInstruction(&MI);
+        }
 
         auto CountInstruction = [&](const MachineInstr &MI) {
           // Skip Meta instructions inside bundles.
@@ -2126,6 +2166,24 @@ void AsmPrinter::emitFunctionBody() {
       for (auto &Handler : Handlers)
         Handler->endInstruction();
     }
+    while (NextPrefetchTargetIndex != -1) {
+      MCSymbol *PrefetchTargetSymbol = OutContext.getOrCreateSymbol(
+          Twine("__llvm_prefetch_target_") + MF->getName() + Twine("_") +
+          utostr(MBB.getBBID()->BaseID) + Twine("_") +
+          utostr(MBB.getPrefetchTargets()[NextPrefetchTargetIndex]));
+      if (MF->getFunction().hasWeakLinkage()) {
+        OutStreamer->emitSymbolAttribute(PrefetchTargetSymbol,
+                                         MCSA_WeakDefinition);
+      } else {
+        OutStreamer->emitSymbolAttribute(PrefetchTargetSymbol, MCSA_Global);
+      }
+      OutStreamer->emitSymbolAttribute(PrefetchTargetSymbol, MCSA_Extern);
+      OutStreamer->emitLabel(PrefetchTargetSymbol);
+      ++NextPrefetchTargetIndex;
+      if (NextPrefetchTargetIndex >=
+          static_cast<int>(MBB.getPrefetchTargets().size()))
+        NextPrefetchTargetIndex = -1;
+    }
 
     // We must emit temporary symbol for the end of this basic block, if either
     // we have BBLabels enabled or if this basic blocks marks the end of a
@@ -2865,7 +2923,7 @@ bool AsmPrinter::doFinalization(Module &M) {
 
   GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
   assert(MI && "AsmPrinter didn't require GCModuleInfo?");
-  for (GCModuleInfo::iterator I = MI->end(), E = MI->begin(); I != E; )
+  for (GCModuleInfo::iterator I = MI->end(), E = MI->begin(); I != E;)
     if (GCMetadataPrinter *MP = getOrCreateGCPrinter(**--I))
       MP->finishAssembly(M, *MI, *this);
 
@@ -3013,13 +3071,13 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
 namespace {
 
 // Keep track the alignment, constpool entries per Section.
-  struct SectionCPs {
-    MCSection *S;
-    Align Alignment;
-    SmallVector<unsigned, 4> CPEs;
+struct SectionCPs {
+  MCSection *S;
+  Align Alignment;
+  SmallVector<unsigned, 4> CPEs;
 
-    SectionCPs(MCSection *s, Align a) : S(s), Alignment(a) {}
-  };
+  SectionCPs(MCSection *s, Align a) : S(s), Alignment(a) {}
+};
 
 } // end anonymous namespace
 
@@ -3037,7 +3095,8 @@ StringRef AsmPrinter::getConstantSectionSuffix(const Constant *C) const {
 void AsmPrinter::emitConstantPool() {
   const MachineConstantPool *MCP = MF->getConstantPool();
   const std::vector<MachineConstantPoolEntry> &CP = MCP->getConstants();
-  if (CP.empty()) return;
+  if (CP.empty())
+    return;
 
   // Calculate sections for constant pool entries. We collect entries to go into
   // the same section together to reduce amount of section switch statements.
@@ -3112,10 +3171,12 @@ void AsmPrinter::emitConstantPool() {
 // function.
 void AsmPrinter::emitJumpTableInfo() {
   const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
-  if (!MJTI) return;
+  if (!MJTI)
+    return;
 
   const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
-  if (JT.empty()) return;
+  if (JT.empty())
+    return;
 
   if (!TM.Options.EnableStaticDataPartitioning) {
     emitJumpTableImpl(*MJTI, llvm::to_vector(llvm::seq<unsigned>(JT.size())));
@@ -3333,7 +3394,7 @@ void AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo &MJTI,
 /// do nothing and return false.
 bool AsmPrinter::emitSpecialLLVMGlobal(const GlobalVariable *GV) {
   if (GV->getName() == "llvm.used") {
-    if (MAI->hasNoDeadStrip())    // No need to emit this at all.
+    if (MAI->hasNoDeadStrip()) // No need to emit this at all.
       emitLLVMUsedList(cast<ConstantArray>(GV->getInitializer()));
     return true;
   }
@@ -3376,7 +3437,8 @@ bool AsmPrinter::emitSpecialLLVMGlobal(const GlobalVariable *GV) {
     return true;
   }
 
-  if (!GV->hasAppendingLinkage()) return false;
+  if (!GV->hasAppendingLinkage())
+    return false;
 
   assert(GV->hasInitializer() && "Not a special LLVM global!");
 
@@ -3406,7 +3468,7 @@ void AsmPrinter::emitLLVMUsedList(const ConstantArray *InitList) {
   // Should be an array of 'i8*'.
   for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
     const GlobalValue *GV =
-      dyn_cast<GlobalValue>(InitList->getOperand(i)->stripPointerCasts());
+        dyn_cast<GlobalValue>(InitList->getOperand(i)->stripPointerCasts());
     if (GV)
       OutStreamer->emitSymbolAttribute(getSymbol(GV), MCSA_NoDeadStrip);
   }
@@ -3820,7 +3882,8 @@ static int isRepeatedByteSequence(const ConstantDataSequential *V) {
   assert(!Data.empty() && "Empty aggregates should be CAZ node");
   char C = Data[0];
   for (unsigned i = 1, e = Data.size(); i != e; ++i)
-    if (Data[i] != C) return -1;
+    if (Data[i] != C)
+      return -1;
   return static_cast<uint8_t>(C); // Ensure 255 is not returned as -1.
 }
 
@@ -3965,7 +4028,8 @@ static void emitGlobalConstantVector(const DataLayout &DL, const Constant *CV,
     EmittedSize = DL.getTypeStoreSize(CV->getType());
   } else {
     for (unsigned I = 0, E = VTy->getNumElements(); I != E; ++I) {
-      emitGlobalAliasInline(AP, DL.getTypeAllocSize(CV->getType()) * I, AliasList);
+      emitGlobalAliasInline(AP, DL.getTypeAllocSize(CV->getType()) * I,
+                            AliasList);
       emitGlobalConstantImpl(DL, CV->getAggregateElement(I), AP);
     }
     EmittedSize = DL.getTypeAllocSize(ElementType) * VTy->getNumElements();
@@ -4083,8 +4147,8 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
       // ExtraBits     0       1       (BitWidth / 64) - 1
       //       chu[nk1 chu][nk2 chu] ... [nkN-1 chunkN]
       ExtraBitsSize = alignTo(ExtraBitsSize, 8);
-      ExtraBits = Realigned.getRawData()[0] &
-        (((uint64_t)-1) >> (64 - ExtraBitsSize));
+      ExtraBits =
+          Realigned.getRawData()[0] & (((uint64_t)-1) >> (64 - ExtraBitsSize));
       if (BitWidth >= 64)
         Realigned.lshrInPlace(ExtraBitsSize);
     } else
@@ -4107,8 +4171,9 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
     uint64_t Size = AP.getDataLayout().getTypeStoreSize(CI->getType());
     Size -= (BitWidth / 64) * 8;
     assert(Size && Size * 8 >= ExtraBitsSize &&
-           (ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize)))
-           == ExtraBits && "Directive too small for extra bits.");
+           (ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize))) ==
+               ExtraBits &&
+           "Directive too small for extra bits.");
     AP.OutStreamer->emitIntValue(ExtraBits, Size);
   }
 }
@@ -4430,12 +4495,13 @@ MCSymbol *AsmPrinter::GetExternalSymbolSymbol(const Twine &Sym) const {
 /// PrintParentLoopComment - Print comments about parent loops of this one.
 static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop,
                                    unsigned FunctionNumber) {
-  if (!Loop) return;
+  if (!Loop)
+    return;
   PrintParentLoopComment(OS, Loop->getParentLoop(), FunctionNumber);
-  OS.indent(Loop->getLoopDepth()*2)
-    << "Parent Loop BB" << FunctionNumber << "_"
-    << Loop->getHeader()->getNumber()
-    << " Depth=" << Loop->getLoopDepth() << '\n';
+  OS.indent(Loop->getLoopDepth() * 2)
+      << "Parent Loop BB" << FunctionNumber << "_"
+      << Loop->getHeader()->getNumber() << " Depth=" << Loop->getLoopDepth()
+      << '\n';
 }
 
 /// PrintChildLoopComment - Print comments about child loops within
@@ -4444,10 +4510,10 @@ static void PrintChildLoopComment(raw_ostream &OS, const MachineLoop *Loop,
                                   unsigned FunctionNumber) {
   // Add child loop information
   for (const MachineLoop *CL : *Loop) {
-    OS.indent(CL->getLoopDepth()*2)
-      << "Child Loop BB" << FunctionNumber << "_"
-      << CL->getHeader()->getNumber() << " Depth " << CL->getLoopDepth()
-      << '\n';
+    OS.indent(CL->getLoopDepth() * 2)
+        << "Child Loop BB" << FunctionNumber << "_"
+        << CL->getHeader()->getNumber() << " Depth " << CL->getLoopDepth()
+        << '\n';
     PrintChildLoopComment(OS, CL, FunctionNumber);
   }
 }
@@ -4458,7 +4524,8 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
                                        const AsmPrinter &AP) {
   // Add loop depth information
   const MachineLoop *Loop = LI->getLoopFor(&MBB);
-  if (!Loop) return;
+  if (!Loop)
+    return;
 
   MachineBasicBlock *Header = Loop->getHeader();
   assert(Header && "No header for loop");
@@ -4467,9 +4534,9 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
   // and return.
   if (Header != &MBB) {
     AP.OutStreamer->AddComment("  in Loop: Header=BB" +
-                               Twine(AP.getFunctionNumber())+"_" +
-                               Twine(Loop->getHeader()->getNumber())+
-                               " Depth="+Twine(Loop->getLoopDepth()));
+                               Twine(AP.getFunctionNumber()) + "_" +
+                               Twine(Loop->getHeader()->getNumber()) +
+                               " Depth=" + Twine(Loop->getLoopDepth()));
     return;
   }
 
@@ -4480,7 +4547,7 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
   PrintParentLoopComment(OS, Loop->getParentLoop(), AP.getFunctionNumber());
 
   OS << "=>";
-  OS.indent(Loop->getLoopDepth()*2-2);
+  OS.indent(Loop->getLoopDepth() * 2 - 2);
 
   OS << "This ";
   if (Loop->isInnermost())
@@ -4601,7 +4668,8 @@ void AsmPrinter::emitVisibility(MCSymbol *Sym, unsigned Visibility,
   MCSymbolAttr Attr = MCSA_Invalid;
 
   switch (Visibility) {
-  default: break;
+  default:
+    break;
   case GlobalValue::HiddenVisibility:
     if (IsDefinition)
       Attr = MAI->getHiddenVisibilityAttr();
@@ -4636,8 +4704,8 @@ bool AsmPrinter::shouldEmitLabelForBasicBlock(
 /// isBlockOnlyReachableByFallthough - Return true if the basic block has
 /// exactly one predecessor and the control transfer mechanism between
 /// the predecessor and this block is a fall-through.
-bool AsmPrinter::
-isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+bool AsmPrinter::isBlockOnlyReachableByFallthrough(
+    const MachineBasicBlock *MBB) const {
   // If this is a landing pad, it isn't a fall through.  If it has no preds,
   // then nothing falls through to it.
   if (MBB->isEHPad() || MBB->pred_empty())
@@ -4840,7 +4908,7 @@ void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,
   auto Attr = F.getFnAttribute("function-instrument");
   bool LogArgs = F.hasFnAttribute("xray-log-args");
   bool AlwaysInstrument =
-    Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always";
+      Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always";
   if (Kind == SledKind::FUNCTION_ENTER && LogArgs)
     Kind = SledKind::LOG_ARGS_ENTER;
   Sleds.emplace_back(XRayFunctionEntry{Sled, CurrentFnSym, Kind,
diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp
index e317e1c06741f..89bfa8a5ebb6f 100644
--- a/llvm/lib/CodeGen/BasicBlockSections.cpp
+++ b/llvm/lib/CodeGen/BasicBlockSections.cpp
@@ -106,7 +106,8 @@ class BasicBlockSections : public MachineFunctionPass {
 public:
   static char ID;
 
-  BasicBlockSectionsProfileReaderWrapperPass *BBSectionsProfileReader = nullptr;
+  // BasicBlockSectionsProfileReaderWrapperPass *BBSectionsProfileReader =
+  // nullptr;
 
   BasicBlockSections() : MachineFunctionPass(ID) {
     initializeBasicBlockSectionsPass(*PassRegistry::getPassRegistry());
@@ -305,8 +306,7 @@ bool BasicBlockSections::handleBBSections(MachineFunction &MF) {
   // clusters of basic blocks using basic block ids. Source drift can
   // invalidate these groupings leading to sub-optimal code generation with
   // regards to performance.
-  if (BBSectionsType == BasicBlockSection::List &&
-      hasInstrProfHashMismatch(MF))
+  if (BBSectionsType == BasicBlockSection::List && hasInstrProfHashMismatch(MF))
     return false;
   // Renumber blocks before sorting them. This is useful for accessing the
   // original layout positions and finding the original fallthroughs.
diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
index fbcd614b85d18..3df7452ac1f0a 100644
--- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
+++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
@@ -65,9 +65,9 @@ std::pair<bool, SmallVector<BBClusterInfo>>
 BasicBlockSectionsProfileReader::getClusterInfoForFunction(
     StringRef FuncName) const {
   auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName));
-  return R != ProgramPathAndClusterInfo.end()
-             ? std::pair(true, R->second.ClusterInfo)
-             : std::pair(false, SmallVector<BBClusterInfo>());
+  if (R == ProgramPathAndClusterInfo.end() || R->second.ClusterInfo.empty())
+    return std::pair(false, SmallVector<BBClusterInfo>());
+  return std::pair(true, R->second.ClusterInfo);
 }
 
 SmallVector<SmallVector<unsigned>>
@@ -91,6 +91,19 @@ uint64_t BasicBlockSectionsProfileReader::getEdgeCount(
   return EdgeIt->second;
 }
 
+SmallVector<PrefetchHint>
+BasicBlockSectionsProfileReader::getPrefetchHintsForFunction(
+    StringRef FuncName) const {
+  return ProgramPathAndClusterInfo.lookup(getAliasName(FuncName)).PrefetchHints;
+}
+
+DenseSet<BBPosition>
+BasicBlockSectionsProfileReader::getPrefetchTargetsForFunction(
+    StringRef FuncName) const {
+  return ProgramPathAndClusterInfo.lookup(getAliasName(FuncName))
+      .PrefetchTargets;
+}
+
 // Reads the version 1 basic block sections profile. Profile for each function
 // is encoded as follows:
 //   m <module_name>
@@ -287,6 +300,61 @@ Error BasicBlockSectionsProfileReader::ReadV1Profile() {
       }
       continue;
     }
+    case 'h': { // Prefetch hint specifier.
+      // Skip the profile line when the profile iterator (FI) refers to the
+      // past-the-end element.
+      if (FI == ProgramPathAndClusterInfo.end())
+        continue;
+      assert(Values.size() == 2);
+      SmallVector<StringRef, 2> PrefetchSiteStr;
+      Values[0].split(PrefetchSiteStr, '@');
+      assert(PrefetchSiteStr.size() == 2);
+      auto SiteBBID = parseUniqueBBID(PrefetchSiteStr[0]);
+      if (!SiteBBID)
+        return SiteBBID.takeError();
+      unsigned long long SiteBBOffset;
+      if (getAsUnsignedInteger(PrefetchSiteStr[1], 10, SiteBBOffset))
+        return createProfileParseError(Twine("unsigned integer expected: '") +
+                                       PrefetchSiteStr[1]);
+
+      SmallVector<StringRef, 3> PrefetchTargetStr;
+      Values[1].split(PrefetchTargetStr, '@');
+      assert(PrefetchTargetStr.size() == 3);
+      auto TargetBBID = parseUniqueBBID(PrefetchTargetStr[1]);
+      if (!TargetBBID)
+        return TargetBBID.takeError();
+      unsigned long long TargetBBOffset;
+      if (getAsUnsignedInteger(PrefetchTargetStr[2], 10, TargetBBOffset))
+        return createProfileParseError(Twine("unsigned integer expected: '") +
+                                       PrefetchTargetStr[2]);
+      // errs() << "Read it " << " " << SiteBBOffset << " " <<
+      // PrefetchTargetStr[0] << " " <<TargetBBOffset << "\n";
+      FI->second.PrefetchHints.push_back(
+          PrefetchHint{{*SiteBBID, static_cast<unsigned>(SiteBBOffset)},
+                       PrefetchTargetStr[0],
+                       {*TargetBBID, static_cast<unsigned>(TargetBBOffset)}});
+      continue;
+    }
+    case 't': { // Prefetch target specifier.
+      // Skip the profile line when the profile iterator (FI) refers to the
+      // past-the-end element.
+      if (FI == ProgramPathAndClusterInfo.end())
+        continue;
+      assert(Values.size() == 1);
+      SmallVector<StringRef, 2> PrefetchTargetStr;
+      Values[0].split(PrefetchTargetStr, '@');
+      assert(PrefetchTargetStr.size() == 2);
+      auto TargetBBID = parseUniqueBBID(PrefetchTargetStr[0]);
+      if (!TargetBBID)
+        return TargetBBID.takeError();
+      unsigned long long TargetBBOffset;
+      if (getAsUnsignedInteger(PrefetchTargetStr[1], 10, TargetBBOffset))
+        return createProfileParseError(Twine("unsigned integer expected: '") +
+                                       PrefetchTargetStr[1]);
+      FI->second.PrefetchTargets.insert(
+          BBPosition{*TargetBBID, static_cast<unsigned>(TargetBBOffset)});
+      continue;
+    }
     default:
       return createProfileParseError(Twine("invalid specifier: '") +
                                      Twine(Specifier) + "'");
@@ -493,6 +561,18 @@ uint64_t BasicBlockSectionsProfileReaderWrapperPass::getEdgeCount(
   return BBSPR.getEdgeCount(FuncName, SrcBBID, SinkBBID);
 }
 
+SmallVector<PrefetchHint>
+BasicBlockSectionsProfileReaderWrapperPass::getPrefetchHintsForFunction(
+    StringRef FuncName) const {
+  return BBSPR.getPrefetchHintsForFunction(FuncName);
+}
+
+DenseSet<BBPosition>
+BasicBlockSectionsProfileReaderWrapperPass::getPrefetchTargetsForFunction(
+    StringRef FuncName) const {
+  return BBSPR.getPrefetchTargetsForFunction(FuncName);
+}
+
 BasicBlockSectionsProfileReader &
 BasicBlockSectionsProfileReaderWrapperPass::getBBSPR() {
   return BBSPR;
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 8ea132626a5af..a6245bfe5c475 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -22,6 +22,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/FloatingPointPredicateUtils.h"
@@ -368,8 +369,8 @@ class CodeGenPrepare {
   std::unique_ptr<DominatorTree> DT;
 
 public:
-  CodeGenPrepare(){};
-  CodeGenPrepare(const TargetMachine *TM) : TM(TM){};
+  CodeGenPrepare() {};
+  CodeGenPrepare(const TargetMachine *TM) : TM(TM) {};
   /// If encounter huge function, we need to limit the build time.
   bool IsHugeFunc = false;
 
@@ -3837,11 +3838,11 @@ class AddressingModeMatcher {
       TypePromotionTransaction &TPT,
       std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
       bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
-      : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
-        DL(MI->getDataLayout()), LI(LI), getDTFn(getDTFn),
-        AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM),
-        InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT),
-        LargeOffsetGEP(LargeOffsetGEP), OptSize(OptSize), PSI(PSI), BFI(BFI) {
+      : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), DL(MI->getDataLayout()), LI(LI),
+        getDTFn(getDTFn), AccessTy(AT), AddrSpace(AS), MemoryInst(MI),
+        AddrMode(AM), InsertedInsts(InsertedInsts),
+        PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP),
+        OptSize(OptSize), PSI(PSI), BFI(BFI) {
     IgnoreProfitability = false;
   }
 
@@ -4478,8 +4479,8 @@ class AddressingModeCombiner {
         // It must be a Phi node then.
         PHINode *CurrentPhi = cast<PHINode>(Current);
         unsigned PredCount = CurrentPhi->getNumIncomingValues();
-        PHINode *PHI =
-            PHINode::Create(CommonType, PredCount, "sunk_phi", CurrentPhi->getIterator());
+        PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi",
+                                       CurrentPhi->getIterator());
         Map[Current] = PHI;
         ST.insertNewPhi(PHI);
         append_range(Worklist, CurrentPhi->incoming_values());
@@ -4791,7 +4792,7 @@ class TypePromotionHelper {
 
 public:
   /// Type for the utility function that promotes the operand of Ext.
-  using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT,
+  using Action = Value *(*)(Instruction * Ext, TypePromotionTransaction &TPT,
                             InstrToOrigTy &PromotedInsts,
                             unsigned &CreatedInstsCost,
                             SmallVectorImpl<Instruction *> *Exts,
@@ -5176,9 +5177,9 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
     // Try to match an integer constant second to increase its chance of ending
     // up in `BaseOffs`, resp. decrease its chance of ending up in `BaseReg`.
     int First = 0, Second = 1;
-    if (isa<ConstantInt>(AddrInst->getOperand(First))
-      && !isa<ConstantInt>(AddrInst->getOperand(Second)))
-        std::swap(First, Second);
+    if (isa<ConstantInt>(AddrInst->getOperand(First)) &&
+        !isa<ConstantInt>(AddrInst->getOperand(Second)))
+      std::swap(First, Second);
     AddrMode.InBounds = false;
     if (matchAddr(AddrInst->getOperand(First), Depth + 1) &&
         matchAddr(AddrInst->getOperand(Second), Depth + 1))
@@ -5262,32 +5263,32 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
     if (VariableOperand == -1) {
       AddrMode.BaseOffs += ConstantOffset;
       if (matchAddr(AddrInst->getOperand(0), Depth + 1)) {
-          if (!cast<GEPOperator>(AddrInst)->isInBounds())
-            AddrMode.InBounds = false;
-          return true;
+        if (!cast<GEPOperator>(AddrInst)->isInBounds())
+          AddrMode.InBounds = false;
+        return true;
       }
       AddrMode.BaseOffs -= ConstantOffset;
 
       if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) &&
           TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 &&
           ConstantOffset > 0) {
-          // Record GEPs with non-zero offsets as candidates for splitting in
-          // the event that the offset cannot fit into the r+i addressing mode.
-          // Simple and common case that only one GEP is used in calculating the
-          // address for the memory access.
-          Value *Base = AddrInst->getOperand(0);
-          auto *BaseI = dyn_cast<Instruction>(Base);
-          auto *GEP = cast<GetElementPtrInst>(AddrInst);
-          if (isa<Argument>(Base) || isa<GlobalValue>(Base) ||
-              (BaseI && !isa<CastInst>(BaseI) &&
-               !isa<GetElementPtrInst>(BaseI))) {
-            // Make sure the parent block allows inserting non-PHI instructions
-            // before the terminator.
-            BasicBlock *Parent = BaseI ? BaseI->getParent()
-                                       : &GEP->getFunction()->getEntryBlock();
-            if (!Parent->getTerminator()->isEHPad())
+        // Record GEPs with non-zero offsets as candidates for splitting in
+        // the event that the offset cannot fit into the r+i addressing mode.
+        // Simple and common case that only one GEP is used in calculating the
+        // address for the memory access.
+        Value *Base = AddrInst->getOperand(0);
+        auto *BaseI = dyn_cast<Instruction>(Base);
+        auto *GEP = cast<GetElementPtrInst>(AddrInst);
+        if (isa<Argument>(Base) || isa<GlobalValue>(Base) ||
+            (BaseI && !isa<CastInst>(BaseI) &&
+             !isa<GetElementPtrInst>(BaseI))) {
+          // Make sure the parent block allows inserting non-PHI instructions
+          // before the terminator.
+          BasicBlock *Parent =
+              BaseI ? BaseI->getParent() : &GEP->getFunction()->getEntryBlock();
+          if (!Parent->getTerminator()->isEHPad())
             LargeOffsetGEP = std::make_pair(GEP, ConstantOffset);
-          }
+        }
       }
 
       return false;
@@ -5622,7 +5623,6 @@ static bool FindAllMemoryUses(
                            PSI, BFI, SeenInsts);
 }
 
-
 /// Return true if Val is already known to be live at the use site that we're
 /// folding it into. If so, there is no cost to include it in the addressing
 /// mode. KnownLive1 and KnownLive2 are two values that we know are live at the
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index ba0b025167307..cf0f47beb4c03 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -90,6 +90,19 @@ MCSymbol *MachineBasicBlock::getSymbol() const {
   return CachedMCSymbol;
 }
 
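+// Lazily create and cache the symbol used to reference the CallInstNumber'th
+// call instruction in this block.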
+MCSymbol *MachineBasicBlock::getCallInstSymbol(unsigned CallInstNumber) const {
+  if (CallInstSymbols.size() <= CallInstNumber)
+    CallInstSymbols.resize(CallInstNumber + 1);
+  if (!CallInstSymbols[CallInstNumber]) {
+    const MachineFunction *MF = getParent();
+    MCContext &Ctx = MF->getContext();
+    CallInstSymbols[CallInstNumber] = Ctx.createBlockSymbol(
+        "BB" + Twine(MF->getFunctionNumber()) + "_" + Twine(getNumber()) + "_" +
+            Twine(CallInstNumber),
+        /*AlwaysEmit=*/true);
+  }
+  return CallInstSymbols[CallInstNumber];
+}
+
 MCSymbol *MachineBasicBlock::getEHContSymbol() const {
   if (!CachedEHContMCSymbol) {
     const MachineFunction *MF = getParent();
@@ -211,8 +224,8 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
   const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
 
   iterator E = end();
-  while (I != E && (I->isPHI() || I->isPosition() ||
-                    TII->isBasicBlockPrologue(*I)))
+  while (I != E &&
+         (I->isPHI() || I->isPosition() || TII->isBasicBlockPrologue(*I)))
     ++I;
   // FIXME: This needs to change if we wish to bundle labels
   // inside the bundle.
@@ -296,9 +309,7 @@ bool MachineBasicBlock::isEntryBlock() const {
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void MachineBasicBlock::dump() const {
-  print(dbgs());
-}
+LLVM_DUMP_METHOD void MachineBasicBlock::dump() const { print(dbgs()); }
 #endif
 
 bool MachineBasicBlock::mayHaveInlineAsmBr() const {
@@ -378,7 +389,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
 
   // Print the preds of this block according to the CFG.
   if (!pred_empty() && IsStandalone) {
-    if (Indexes) OS << '\t';
+    if (Indexes)
+      OS << '\t';
     // Don't indent(2), align with previous line attributes.
     OS << "; predecessors: ";
     ListSeparator LS;
@@ -389,7 +401,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
   }
 
   if (!succ_empty()) {
-    if (Indexes) OS << '\t';
+    if (Indexes)
+      OS << '\t';
     // Print the successors
     OS.indent(2) << "successors: ";
     ListSeparator LS;
@@ -420,7 +433,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
   }
 
   if (!livein_empty() && MRI.tracksLiveness()) {
-    if (Indexes) OS << '\t';
+    if (Indexes)
+      OS << '\t';
     OS.indent(2) << "liveins: ";
 
     ListSeparator LS;
@@ -463,7 +477,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
     OS.indent(2) << "}\n";
 
   if (IrrLoopHeaderWeight && IsStandalone) {
-    if (Indexes) OS << '\t';
+    if (Indexes)
+      OS << '\t';
     OS.indent(2) << "; Irreducible loop header weight: " << *IrrLoopHeaderWeight
                  << '\n';
   }
@@ -659,8 +674,8 @@ void MachineBasicBlock::sortUniqueLiveIns() {
   LiveIns.erase(Out, LiveIns.end());
 }
 
-Register
-MachineBasicBlock::addLiveIn(MCRegister PhysReg, const TargetRegisterClass *RC) {
+Register MachineBasicBlock::addLiveIn(MCRegister PhysReg,
+                                      const TargetRegisterClass *RC) {
   assert(getParent() && "MBB must be inserted in function");
   assert(PhysReg.isPhysical() && "Expected physreg");
   assert(RC && "Register class is required");
@@ -674,7 +689,7 @@ MachineBasicBlock::addLiveIn(MCRegister PhysReg, const TargetRegisterClass *RC)
 
   // Look for an existing copy.
   if (LiveIn)
-    for (;I != E && I->isCopy(); ++I)
+    for (; I != E && I->isCopy(); ++I)
       if (I->getOperand(1).getReg() == PhysReg) {
         Register VirtReg = I->getOperand(0).getReg();
         if (!MRI.constrainRegClass(VirtReg, RC))
@@ -685,7 +700,7 @@ MachineBasicBlock::addLiveIn(MCRegister PhysReg, const TargetRegisterClass *RC)
   // No luck, create a virtual register.
   Register VirtReg = MRI.createVirtualRegister(RC);
   BuildMI(*this, I, DebugLoc(), TII.get(TargetOpcode::COPY), VirtReg)
-    .addReg(PhysReg, RegState::Kill);
+      .addReg(PhysReg, RegState::Kill);
   if (!LiveIn)
     addLiveIn(PhysReg);
   return VirtReg;
@@ -722,7 +737,7 @@ void MachineBasicBlock::updateTerminator(
   SmallVector<MachineOperand, 4> Cond;
   DebugLoc DL = findBranchDebugLoc();
   bool B = TII->analyzeBranch(*this, TBB, FBB, Cond);
-  (void) B;
+  (void)B;
   assert(!B && "UpdateTerminators requires analyzable predecessors!");
   if (Cond.empty()) {
     if (TBB) {
@@ -951,8 +966,8 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) {
   }
 }
 
-void
-MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) {
+void MachineBasicBlock::transferSuccessorsAndUpdatePHIs(
+    MachineBasicBlock *FromMBB) {
   if (this == FromMBB)
     return;
 
@@ -1019,7 +1034,8 @@ MachineBasicBlock *MachineBasicBlock::getFallThrough(bool JumpToFallThrough) {
   }
 
   // If there is no branch, control always falls through.
-  if (!TBB) return &*Fallthrough;
+  if (!TBB)
+    return &*Fallthrough;
 
   // If there is some explicit branch to the fallthrough block, it can obviously
   // reach, even though the branch should get folded to fall through implicitly.
@@ -1029,16 +1045,15 @@ MachineBasicBlock *MachineBasicBlock::getFallThrough(bool JumpToFallThrough) {
 
   // If it's an unconditional branch to some block not the fall through, it
   // doesn't fall through.
-  if (Cond.empty()) return nullptr;
+  if (Cond.empty())
+    return nullptr;
 
   // Otherwise, if it is conditional and has no explicit false block, it falls
   // through.
   return (FBB == nullptr) ? &*Fallthrough : nullptr;
 }
 
-bool MachineBasicBlock::canFallThrough() {
-  return getFallThrough() != nullptr;
-}
+bool MachineBasicBlock::canFallThrough() { return getFallThrough() != nullptr; }
 
 MachineBasicBlock *MachineBasicBlock::splitAt(MachineInstr &MI,
                                               bool UpdateLiveIns,
@@ -1312,7 +1327,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
     // will extend to the end of the new split block.
 
     bool isLastMBB =
-      std::next(MachineFunction::iterator(NMBB)) == getParent()->end();
+        std::next(MachineFunction::iterator(NMBB)) == getParent()->end();
 
     SlotIndex StartIndex = Indexes->getMBBEndIdx(this);
     SlotIndex PrevIndex = StartIndex.getPrevSlot();
@@ -1320,11 +1335,11 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
 
     // Find the registers used from NMBB in PHIs in Succ.
     SmallSet<Register, 8> PHISrcRegs;
-    for (MachineBasicBlock::instr_iterator
-         I = Succ->instr_begin(), E = Succ->instr_end();
+    for (MachineBasicBlock::instr_iterator I = Succ->instr_begin(),
+                                           E = Succ->instr_end();
          I != E && I->isPHI(); ++I) {
       for (unsigned ni = 1, ne = I->getNumOperands(); ni != ne; ni += 2) {
-        if (I->getOperand(ni+1).getMBB() == NMBB) {
+        if (I->getOperand(ni + 1).getMBB() == NMBB) {
           MachineOperand &MO = I->getOperand(ni);
           Register Reg = MO.getReg();
           PHISrcRegs.insert(Reg);
@@ -1488,8 +1503,8 @@ MachineInstr *MachineBasicBlock::remove_instr(MachineInstr *MI) {
   return Insts.remove(MI);
 }
 
-MachineBasicBlock::instr_iterator
-MachineBasicBlock::insert(instr_iterator I, MachineInstr *MI) {
+MachineBasicBlock::instr_iterator MachineBasicBlock::insert(instr_iterator I,
+                                                            MachineInstr *MI) {
   assert(!MI->isBundledWithPred() && !MI->isBundledWithSucc() &&
          "Cannot insert instruction with bundle flags");
   // Set the bundle flags when inserting inside a bundle.
@@ -1523,7 +1538,8 @@ void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old,
   MachineBasicBlock::instr_iterator I = instr_end();
   while (I != instr_begin()) {
     --I;
-    if (!I->isTerminator()) break;
+    if (!I->isTerminator())
+      break;
 
     // Scan the operands of this machine instruction, replacing any uses of Old
     // with New.
@@ -1548,8 +1564,7 @@ void MachineBasicBlock::replacePhiUsesWith(MachineBasicBlock *Old,
 
 /// Find the next valid DebugLoc starting at MBBI, skipping any debug
 /// instructions.  Return UnknownLoc if there is none.
-DebugLoc
-MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
+DebugLoc MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
   // Skip debug declarations, we don't want a DebugLoc from them.
   MBBI = skipDebugInstructionsForward(MBBI, instr_end());
   if (MBBI != instr_end())
@@ -1591,8 +1606,7 @@ DebugLoc MachineBasicBlock::rfindPrevDebugLoc(reverse_instr_iterator MBBI) {
 
 /// Find and return the merged DebugLoc of the branch instructions of the block.
 /// Return UnknownLoc if there is none.
-DebugLoc
-MachineBasicBlock::findBranchDebugLoc() {
+DebugLoc MachineBasicBlock::findBranchDebugLoc() {
   DebugLoc DL;
   auto TI = getFirstTerminator();
   while (TI != end() && !TI->isBranch())
@@ -1600,7 +1614,7 @@ MachineBasicBlock::findBranchDebugLoc() {
 
   if (TI != end()) {
     DL = TI->getDebugLoc();
-    for (++TI ; TI != end() ; ++TI)
+    for (++TI; TI != end(); ++TI)
       if (TI->isBranch())
         DL = DebugLoc::getMergedLocation(DL, TI->getDebugLoc());
   }
@@ -1682,7 +1696,8 @@ MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) {
 /// instructions after (searching just for defs) MI.
 MachineBasicBlock::LivenessQueryResult
 MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
-                                           MCRegister Reg, const_iterator Before,
+                                           MCRegister Reg,
+                                           const_iterator Before,
                                            unsigned Neighborhood) const {
   unsigned N = Neighborhood;
 
@@ -1717,7 +1732,6 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
     return LQR_Dead;
   }
 
-
   N = Neighborhood;
 
   // Start by searching backwards from Before, looking for kills, reads or defs.
@@ -1792,9 +1806,7 @@ MachineBasicBlock::getEndClobberMask(const TargetRegisterInfo *TRI) const {
   return isReturnBlock() && !succ_empty() ? TRI->getNoPreservedMask() : nullptr;
 }
 
-void MachineBasicBlock::clearLiveIns() {
-  LiveIns.clear();
-}
+void MachineBasicBlock::clearLiveIns() { LiveIns.clear(); }
 
 void MachineBasicBlock::clearLiveIns(
     std::vector<RegisterMaskPair> &OldLiveIns) {
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index f9bd233cf8ecf..522532aa9be5d 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -58,6 +58,7 @@ set(sources
   X86IndirectThunks.cpp
   X86InterleavedAccess.cpp
   X86InsertPrefetch.cpp
+  PrefetchInsertion.cpp
   X86InstCombineIntrinsic.cpp
   X86InstrFMA3Info.cpp
   X86InstrFoldTables.cpp
diff --git a/llvm/lib/Target/X86/PrefetchInsertion.cpp b/llvm/lib/Target/X86/PrefetchInsertion.cpp
new file mode 100644
index 0000000000000..720a38cb9b011
--- /dev/null
+++ b/llvm/lib/Target/X86/PrefetchInsertion.cpp
@@ -0,0 +1,209 @@
+//===-- PrefetchInsertion.cpp - Profile-guided prefetch insertion ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements a machine pass that inserts code prefetch
+/// instructions guided by Propeller profile hints.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/BasicBlockSectionUtils.h"
+#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Target/TargetMachine.h"
+#include <map>
+
+using namespace llvm;
+#define DEBUG_TYPE "prefetchinsertion"
+
+static cl::opt<bool> UseCodePrefetchInstruction(
+    "use-code-prefetch-instruction",
+    cl::desc("Whether to use the new prefetchit1 instruction."), cl::init(true),
+    cl::Hidden);
+static cl::opt<bool> PrefetchNextAddress(
+    "prefetch-next-address",
+    cl::desc(
+        "Whether to prefetch the next address instead of the target address."),
+    cl::init(false), cl::Hidden);
+
+namespace llvm {
+class PrefetchInsertion : public MachineFunctionPass {
+public:
+  static char ID;
+
+  PrefetchInsertion() : MachineFunctionPass(ID) {
+    initializePrefetchInsertionPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "Prefetch Insertion Pass"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  /// Insert prefetch instructions in this function based on the prefetch
+  /// hints and targets read from the Propeller profile.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // namespace llvm
+
+char PrefetchInsertion::ID = 0;
+INITIALIZE_PASS_BEGIN(PrefetchInsertion, "prefetch-insertion",
+                      "Insert code prefetches based on the Propeller profile",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReaderWrapperPass)
+INITIALIZE_PASS_END(PrefetchInsertion, "prefetch-insertion",
+                    "Insert code prefetches based on the Propeller profile",
+                    false, false)
+
+bool PrefetchInsertion::runOnMachineFunction(MachineFunction &MF) {
+  assert(MF.getTarget().getBBSectionsType() == BasicBlockSection::List &&
+         "BB Sections list not enabled!");
+  if (hasInstrProfHashMismatch(MF))
+    return false;
+  // errs() << "Running on " << MF.getName() << "\n";
+  Function &F = MF.getFunction();
+  auto PtrTy = PointerType::getUnqual(F.getParent()->getContext());
+  DenseSet<BBPosition> PrefetchTargets =
+      getAnalysis<BasicBlockSectionsProfileReaderWrapperPass>()
+          .getPrefetchTargetsForFunction(MF.getName());
+  // errs() << "Targets: Function: " << F.getName() << " "
+  //        << PrefetchTargets.size() << "\n";
+  DenseMap<UniqueBBID, SmallVector<unsigned>> PrefetchTargetsByBBID;
+  for (const auto &P : PrefetchTargets)
+    PrefetchTargetsByBBID[P.BBID].push_back(P.BBOffset);
+  for (auto &[BBID, V] : PrefetchTargetsByBBID)
+    llvm::sort(V);
+  for (auto &BB : MF)
+    BB.setPrefetchTargets(PrefetchTargetsByBBID[*BB.getBBID()]);
+
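+  // Declare an external global named
+  // "__llvm_prefetch_target_<function>_<bb_id>_<offset>" for each prefetch
+  // target; these globals are referenced below as prefetch address operands.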
+  for (const BBPosition &P : PrefetchTargets) {
+    SmallString<128> PrefetchTargetName("__llvm_prefetch_target_");
+    PrefetchTargetName += F.getName();
+    PrefetchTargetName += "_";
+    PrefetchTargetName += utostr(P.BBID.BaseID);
+    PrefetchTargetName += "_";
+    PrefetchTargetName += utostr(P.BBOffset);
+    F.getParent()->getOrInsertGlobal(PrefetchTargetName, PtrTy);
+  }
+
+  SmallVector<PrefetchHint> PrefetchHints =
+      getAnalysis<BasicBlockSectionsProfileReaderWrapperPass>()
+          .getPrefetchHintsForFunction(MF.getName());
+  // errs() << "Hints: Function: " << F.getName() << " " << PrefetchHints.size()
+  //        << "\n";
+  for (const PrefetchHint &H : PrefetchHints) {
+    SmallString<128> PrefetchTargetName("__llvm_prefetch_target_");
+    PrefetchTargetName += H.TargetFunctionName;
+    PrefetchTargetName += "_";
+    PrefetchTargetName += utostr(H.TargetPosition.BBID.BaseID);
+    PrefetchTargetName += "_";
+    PrefetchTargetName += utostr(H.TargetPosition.BBOffset);
+    F.getParent()->getOrInsertGlobal(PrefetchTargetName, PtrTy);
+  }
+
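+  // Group the hints by their site block and, within each block, by the
+  // callsite offset at which the prefetch should be inserted.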
+  DenseMap<UniqueBBID, std::map<unsigned, SmallVector<PrefetchTarget>>>
+      PrefetchHintsByBBID;
+  for (const auto &H : PrefetchHints) {
+    PrefetchHintsByBBID[H.SitePosition.BBID][H.SitePosition.BBOffset].push_back(
+        PrefetchTarget{H.TargetFunctionName, H.TargetPosition.BBID,
+                       H.TargetPosition.BBOffset});
+  }
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
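+  // For each block with hints, count call instructions; once the count
+  // reaches a hint's site offset (or the first terminator is reached), insert
+  // a RIP-relative prefetch of the corresponding target symbol before the
+  // current instruction.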
+  for (auto &BB : MF) {
+    auto It = PrefetchHintsByBBID.find(*BB.getBBID());
+    if (It == PrefetchHintsByBBID.end())
+      continue;
+    auto BBPrefetchHintIt = It->second.begin();
+    auto E = BB.getFirstTerminator();
+    unsigned NumCallsites = 0;
+    for (auto I = BB.instr_begin();;) {
+      auto Current = I;
+      if (NumCallsites >= BBPrefetchHintIt->first || Current == E) {
+        for (const auto &PrefetchTarget : BBPrefetchHintIt->second) {
+          SmallString<128> PrefetchTargetName("__llvm_prefetch_target_");
+          PrefetchTargetName += PrefetchTarget.TargetFunction;
+          PrefetchTargetName += "_";
+          PrefetchTargetName += utostr(PrefetchTarget.TargetBBID.BaseID);
+          PrefetchTargetName += "_";
+          PrefetchTargetName += utostr(PrefetchTarget.TargetBBOffset);
+          auto *GV =
+              MF.getFunction().getParent()->getNamedValue(PrefetchTargetName);
+          // errs() << "Inserting prefetch for " << GV->getName() << " at "
+          //        << MF.getName() << " " << BB.getName() << " " << NumInsts
+          //        << "\n";
+          MachineInstr *PFetch = MF.CreateMachineInstr(
+              UseCodePrefetchInstruction ? TII->get(X86::PREFETCHIT1)
+                                         : TII->get(X86::PREFETCHT1),
+              Current != BB.instr_end() ? Current->getDebugLoc() : DebugLoc(),
+              true);
+          PFetch->setFlag(MachineInstr::Prefetch);
+          MachineInstrBuilder MIB(MF, PFetch);
+          if (!PrefetchNextAddress) {
+            MIB.addMemOperand(MF.getMachineMemOperand(
+                MachinePointerInfo(GV), MachineMemOperand::MOLoad, /*s=*/8,
+                /*base_alignment=*/llvm::Align(1)));
+          }
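+          // Address operands: base = RIP, scale = 1, no index register,
+          // displacement = target symbol (or 0 when prefetching the next
+          // address), no segment register.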
+          MIB.addReg(X86::RIP).addImm(1).addReg(X86::NoRegister);
+          if (PrefetchNextAddress)
+            MIB.addImm(0);
+          else
+            MIB.addGlobalAddress(GV);
+          MIB.addReg(X86::NoRegister);
+          BB.insert(Current, PFetch);
+        }
+        ++BBPrefetchHintIt;
+        if (BBPrefetchHintIt == PrefetchHintsByBBID[*BB.getBBID()].end())
+          break;
+      }
+      if (Current != E) {
+        if (Current->isCall())
+          ++NumCallsites;
+        ++I;
+      }
+    }
+  }
+  return true;
+}
+
+void PrefetchInsertion::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<BasicBlockSectionsProfileReaderWrapperPass>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+FunctionPass *llvm::createPrefetchInsertionPass() {
+  return new PrefetchInsertion();
+}
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 51b540a7a51d0..5fd0eb8f4f594 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -76,7 +76,8 @@ FunctionPass *createX86OptimizeLEAs();
 /// Return a pass that transforms setcc + movzx pairs into xor + setcc.
 FunctionPass *createX86FixupSetCC();
 
-/// Return a pass that avoids creating store forward block issues in the hardware.
+/// Return a pass that avoids creating store forward block issues in the
+/// hardware.
 FunctionPass *createX86AvoidStoreForwardingBlocks();
 
 /// Return a pass that lowers EFLAGS copy pseudo instructions.
@@ -134,8 +135,8 @@ FunctionPass *createX86FixupBWInsts();
 /// to another, when profitable.
 FunctionPass *createX86DomainReassignmentPass();
 
-/// This pass compress instructions from EVEX space to legacy/VEX/EVEX space when
-/// possible in order to reduce code size or facilitate HW decoding.
+/// This pass compress instructions from EVEX space to legacy/VEX/EVEX space
+/// when possible in order to reduce code size or facilitate HW decoding.
 FunctionPass *createX86CompressEVEXPass();
 
 /// This pass creates the thunks for the retpoline feature.
@@ -151,6 +152,8 @@ FunctionPass *createX86DiscriminateMemOpsPass();
 /// This pass applies profiling information to insert cache prefetches.
 FunctionPass *createX86InsertPrefetchPass();
 
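+/// This pass inserts code prefetch instructions based on Propeller profile
+/// hints.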
+FunctionPass *createPrefetchInsertionPass();
+
 /// This pass insert wait instruction after X87 instructions which could raise
 /// fp exceptions when strict-fp enabled.
 FunctionPass *createX86InsertX87waitPass();
@@ -238,8 +241,8 @@ enum : unsigned {
   PTR32_UPTR = 271,
   PTR64 = 272
 };
-} // End X86AS namespace
+} // namespace X86AS
 
-} // End llvm namespace
+} // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 9a76abcd351bf..2c777e7f4ac38 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -54,9 +54,10 @@
 
 using namespace llvm;
 
-static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
-                               cl::desc("Enable the machine combiner pass"),
-                               cl::init(true), cl::Hidden);
+static cl::opt<bool>
+    EnableMachineCombinerPass("x86-machine-combiner",
+                              cl::desc("Enable the machine combiner pass"),
+                              cl::init(true), cl::Hidden);
 
 static cl::opt<bool>
     EnableTileRAPass("x86-tile-ra",
@@ -362,7 +363,7 @@ namespace {
 class X86PassConfig : public TargetPassConfig {
 public:
   X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {}
+      : TargetPassConfig(TM, PM) {}
 
   X86TargetMachine &getX86TargetMachine() const {
     return getTM<X86TargetMachine>();
@@ -401,10 +402,10 @@ char X86ExecutionDomainFix::ID;
 } // end anonymous namespace
 
 INITIALIZE_PASS_BEGIN(X86ExecutionDomainFix, "x86-execution-domain-fix",
-  "X86 Execution Domain Fix", false, false)
+                      "X86 Execution Domain Fix", false, false)
 INITIALIZE_PASS_DEPENDENCY(ReachingDefInfoWrapperPass)
 INITIALIZE_PASS_END(X86ExecutionDomainFix, "x86-execution-domain-fix",
-  "X86 Execution Domain Fix", false, false)
+                    "X86 Execution Domain Fix", false, false)
 
 TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
   return new X86PassConfig(*this, PM);
@@ -627,6 +628,9 @@ void X86PassConfig::addPreEmitPass2() {
   // after all real instructions have been added to the epilog.
   if (TT.isOSWindows() && TT.isX86_64())
     addPass(createX86WinEHUnwindV2Pass());
+
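+  // Insert code prefetches based on the Propeller profile. This is only done
+  // when basic block sections are enabled from a list profile.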
+  if (TM->getBBSectionsType() == llvm::BasicBlockSection::List)
+    addPass(createPrefetchInsertionPass());
 }
 
 bool X86PassConfig::addPostFastRegAllocRewrite() {



More information about the llvm-commits mailing list