[llvm] 30e612e - TLS loads opimization (hoist)

Xiang1 Zhang via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 1 18:37:38 PST 2022


Author: Xiang1 Zhang
Date: 2022-03-02T10:37:24+08:00
New Revision: 30e612ebdfb0f243eb63d93487790a53c26ae873

URL: https://github.com/llvm/llvm-project/commit/30e612ebdfb0f243eb63d93487790a53c26ae873
DIFF: https://github.com/llvm/llvm-project/commit/30e612ebdfb0f243eb63d93487790a53c26ae873.diff

LOG: TLS loads opimization (hoist)
Reviewed By: Wang Pheobe, Topper Craig

Differential Revision: https://reviews.llvm.org/D120000

Added: 
    llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h
    llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp
    llvm/test/CodeGen/X86/tls-loads-control.ll
    llvm/test/CodeGen/X86/tls-loads-control2.ll
    llvm/test/CodeGen/X86/tls-loads-control3.ll

Modified: 
    llvm/include/llvm/CodeGen/MachinePassRegistry.def
    llvm/include/llvm/InitializePasses.h
    llvm/include/llvm/LinkAllPasses.h
    llvm/include/llvm/Transforms/Scalar.h
    llvm/lib/CodeGen/TargetPassConfig.cpp
    llvm/lib/Passes/PassBuilder.cpp
    llvm/lib/Passes/PassRegistry.def
    llvm/lib/Transforms/Scalar/CMakeLists.txt
    llvm/lib/Transforms/Scalar/Scalar.cpp
    llvm/test/CodeGen/AArch64/O3-pipeline.ll
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
    llvm/test/CodeGen/ARM/O3-pipeline.ll
    llvm/test/CodeGen/PowerPC/O3-pipeline.ll
    llvm/test/CodeGen/X86/opt-pipeline.ll
    llvm/tools/llc/llc.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
index e6763899a083b..a63f405e7e77e 100644
--- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def
+++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
@@ -47,6 +47,7 @@ FUNCTION_PASS("expand-reductions", ExpandReductionsPass, ())
 FUNCTION_PASS("expandvp", ExpandVectorPredicationPass, ())
 FUNCTION_PASS("lowerinvoke", LowerInvokePass, ())
 FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass, ())
+FUNCTION_PASS("tlshoist", TLSVariableHoistPass, ())
 FUNCTION_PASS("verify", VerifierPass, ())
 #undef FUNCTION_PASS
 

diff  --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 16870b924dd24..3a98bacef81d0 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -445,6 +445,7 @@ void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&);
 void initializeTargetPassConfigPass(PassRegistry&);
 void initializeTargetTransformInfoWrapperPassPass(PassRegistry&);
 void initializeThreadSanitizerLegacyPassPass(PassRegistry&);
+void initializeTLSVariableHoistLegacyPassPass(PassRegistry &);
 void initializeTwoAddressInstructionPassPass(PassRegistry&);
 void initializeTypeBasedAAWrapperPassPass(PassRegistry&);
 void initializeTypePromotionPass(PassRegistry&);

diff  --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 0e7432ba3f537..bd8bea6b99a06 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -177,6 +177,7 @@ namespace {
       (void) llvm::createStripDeadDebugInfoPass();
       (void) llvm::createStripDeadPrototypesPass();
       (void) llvm::createTailCallEliminationPass();
+      (void)llvm::createTLSVariableHoistPass();
       (void) llvm::createJumpThreadingPass();
       (void) llvm::createDFAJumpThreadingPass();
       (void) llvm::createUnifyFunctionExitNodesPass();

diff  --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
index 4d6874f784efb..887ddf943c65c 100644
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -427,6 +427,12 @@ extern char &InferAddressSpacesID;
 // "block_weights" metadata.
 FunctionPass *createLowerExpectIntrinsicPass();
 
+//===----------------------------------------------------------------------===//
+//
+// TLSVariableHoist - This pass reduces duplicated TLS address calls.
+//
+FunctionPass *createTLSVariableHoistPass();
+
 //===----------------------------------------------------------------------===//
 //
 // LowerConstantIntrinsicss - Expand any remaining llvm.objectsize and

diff  --git a/llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h b/llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h
new file mode 100644
index 0000000000000..5feebf7a05dc3
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/TLSVariableHoist.h
@@ -0,0 +1,133 @@
+//==- TLSVariableHoist.h ------ Remove Redundant TLS Loads -------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies/eliminates Redundant TLS Loads if related option is set.
+// For example:
+// static __thread int x;
+// int g();
+// int f(int c) {
+//   int *px = &x;
+//   while (c--)
+//     *px += g();
+//   return *px;
+// }
+//
+// will generate Redundant TLS Loads by compiling it with
+// clang++ -fPIC -ftls-model=global-dynamic -O2 -S
+//
+// .LBB0_2:                                # %while.body
+//                                         # =>This Inner Loop Header: Depth=1
+//         callq   _Z1gv at PLT
+//         movl    %eax, %ebp
+//         leaq    _ZL1x at TLSLD(%rip), %rdi
+//         callq   __tls_get_addr at PLT
+//         addl    _ZL1x at DTPOFF(%rax), %ebp
+//         movl    %ebp, _ZL1x at DTPOFF(%rax)
+//         addl    $-1, %ebx
+//         jne     .LBB0_2
+//         jmp     .LBB0_3
+// .LBB0_4:                                # %entry.while.end_crit_edge
+//         leaq    _ZL1x at TLSLD(%rip), %rdi
+//         callq   __tls_get_addr at PLT
+//         movl    _ZL1x at DTPOFF(%rax), %ebp
+//
+// The Redundant TLS Loads will hurt the performance, especially in loops.
+// So we try to eliminate/move them when requested by the user, producing:
+//
+// # %bb.0:                                # %entry
+//         ...
+//         movl    %edi, %ebx
+//         leaq    _ZL1x at TLSLD(%rip), %rdi
+//         callq   __tls_get_addr at PLT
+//         leaq    _ZL1x at DTPOFF(%rax), %r14
+//         testl   %ebx, %ebx
+//         je      .LBB0_1
+// .LBB0_2:                                # %while.body
+//                                         # =>This Inner Loop Header: Depth=1
+//         callq   _Z1gv at PLT
+//         addl    (%r14), %eax
+//         movl    %eax, (%r14)
+//         addl    $-1, %ebx
+//         jne     .LBB0_2
+//         jmp     .LBB0_3
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H
+#define LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class BasicBlock;
+class DominatorTree;
+class Function;
+class GlobalVariable;
+class Instruction;
+
+/// A private "module" namespace for types and utilities used by
+/// TLSVariableHoist. These are implementation details and should
+/// not be used by clients.
+namespace tlshoist {
+
+/// Keeps track of the user of a TLS variable and the operand index
+/// where the variable is used.
+struct TLSUser {
+  Instruction *Inst;
+  unsigned OpndIdx;
+
+  TLSUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) {}
+};
+
+/// Keeps track of a TLS variable candidate and its users.
+struct TLSCandidate {
+  SmallVector<TLSUser, 8> Users;
+
+  /// Add the user to the use list.
+  void addUser(Instruction *Inst, unsigned Idx) {
+    Users.push_back(TLSUser(Inst, Idx));
+  }
+};
+
+} // end namespace tlshoist
+
+class TLSVariableHoistPass : public PassInfoMixin<TLSVariableHoistPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+  // Glue for old PM.
+  bool runImpl(Function &F, DominatorTree &DT, LoopInfo &LI);
+
+private:
+  DominatorTree *DT;
+  LoopInfo *LI;
+
+  /// Keeps track of TLS variable candidates found in the function.
+  using TLSCandMapType = MapVector<GlobalVariable *, tlshoist::TLSCandidate>;
+  TLSCandMapType TLSCandMap;
+
+  void collectTLSCandidates(Function &Fn);
+  void collectTLSCandidate(Instruction *Inst);
+  BasicBlock::iterator findInsertPosInEntry(Function &Fn,
+                                            tlshoist::TLSCandidate &Cand);
+  Instruction *getNearestLoopDomInst(BasicBlock *BB);
+  Instruction *getDomInst(Instruction *I1, Instruction *I2);
+  BasicBlock::iterator findInsertPos(Function &Fn, GlobalVariable *GV,
+                                     BasicBlock *&PosBB);
+  Instruction *genBitCastInst(Function &Fn, GlobalVariable *GV);
+  bool tryReplaceTLSCandidates(Function &Fn);
+  bool tryReplaceTLSCandidate(Function &Fn, GlobalVariable *GV);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_TLSVARIABLEHOIST_H

diff  --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 2b7f0ed41e1c9..af8c311b605ea 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -922,6 +922,9 @@ void TargetPassConfig::addIRPasses() {
   // Allow disabling it for testing purposes.
   if (!DisableExpandReductions)
     addPass(createExpandReductionsPass());
+
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createTLSVariableHoistPass());
 }
 
 /// Turn exception handling constructs into something the code generators can

diff  --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index dedfc81f11bba..4af14b2717c01 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -212,6 +212,7 @@
 #include "llvm/Transforms/Scalar/SpeculativeExecution.h"
 #include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
 #include "llvm/Transforms/Scalar/StructurizeCFG.h"
+#include "llvm/Transforms/Scalar/TLSVariableHoist.h"
 #include "llvm/Transforms/Scalar/TailRecursionElimination.h"
 #include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
 #include "llvm/Transforms/Utils/AddDiscriminators.h"

diff  --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 0c77509630d25..199324d5b93ea 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -361,6 +361,7 @@ FUNCTION_PASS("verify<safepoint-ir>", SafepointIRVerifierPass())
 FUNCTION_PASS("verify<scalar-evolution>", ScalarEvolutionVerifierPass())
 FUNCTION_PASS("view-cfg", CFGViewerPass())
 FUNCTION_PASS("view-cfg-only", CFGOnlyViewerPass())
+FUNCTION_PASS("tlshoist", TLSVariableHoistPass())
 FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass())
 FUNCTION_PASS("tsan", ThreadSanitizerPass())
 FUNCTION_PASS("memprof", MemProfilerPass())

diff  --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 79559a4d14a07..2b90abb97a52c 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -77,6 +77,7 @@ add_llvm_component_library(LLVMScalarOpts
   StraightLineStrengthReduce.cpp
   StructurizeCFG.cpp
   TailRecursionElimination.cpp
+  TLSVariableHoist.cpp
   WarnMissedTransforms.cpp
 
   ADDITIONAL_HEADER_DIRS

diff  --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index f9650efc051fb..a04fe339ad297 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -104,6 +104,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeSimpleLoopUnswitchLegacyPassPass(Registry);
   initializeSinkingLegacyPassPass(Registry);
   initializeTailCallElimPass(Registry);
+  initializeTLSVariableHoistLegacyPassPass(Registry);
   initializeSeparateConstOffsetFromGEPLegacyPassPass(Registry);
   initializeSpeculativeExecutionLegacyPassPass(Registry);
   initializeStraightLineStrengthReduceLegacyPassPass(Registry);

diff  --git a/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp b/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp
new file mode 100644
index 0000000000000..8102d6ad597dd
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp
@@ -0,0 +1,341 @@
+//===- TLSVariableHoist.cpp -------- Remove Redundant TLS Loads ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies/eliminates Redundant TLS Loads if related option is set.
+// For an example, refer to the comment at the head of TLSVariableHoist.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/TLSVariableHoist.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+using namespace tlshoist;
+
+#define DEBUG_TYPE "tlshoist"
+
+// TODO: Support "strict" model if we need to strictly load TLS address,
+// because "non-optimize" may also do some optimization in other passes.
+static cl::opt<std::string> TLSLoadHoist(
+    "tls-load-hoist",
+    cl::desc(
+        "hoist the TLS loads in PIC model: "
+        "tls-load-hoist=optimize: Eleminate redundant TLS load(s)."
+        "tls-load-hoist=strict: Strictly load TLS address before every use."
+        "tls-load-hoist=non-optimize: Generally load TLS before use(s)."),
+    cl::init("non-optimize"), cl::Hidden);
+
+namespace {
+
+/// The TLS Variable hoist pass.
+class TLSVariableHoistLegacyPass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  TLSVariableHoistLegacyPass() : FunctionPass(ID) {
+    initializeTLSVariableHoistLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &Fn) override;
+
+  StringRef getPassName() const override { return "TLS Variable Hoist"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+
+private:
+  TLSVariableHoistPass Impl;
+};
+
+} // end anonymous namespace
+
+char TLSVariableHoistLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(TLSVariableHoistLegacyPass, "tlshoist",
+                      "TLS Variable Hoist", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(TLSVariableHoistLegacyPass, "tlshoist",
+                    "TLS Variable Hoist", false, false)
+
+FunctionPass *llvm::createTLSVariableHoistPass() {
+  return new TLSVariableHoistLegacyPass();
+}
+
+/// Perform the TLS Variable Hoist optimization for the given function.
+bool TLSVariableHoistLegacyPass::runOnFunction(Function &Fn) {
+  if (skipFunction(Fn))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "********** Begin TLS Variable Hoist **********\n");
+  LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+
+  bool MadeChange =
+      Impl.runImpl(Fn, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+                   getAnalysis<LoopInfoWrapperPass>().getLoopInfo());
+
+  if (MadeChange) {
+    LLVM_DEBUG(dbgs() << "********** Function after TLS Variable Hoist: "
+                      << Fn.getName() << '\n');
+    LLVM_DEBUG(dbgs() << Fn);
+  }
+  LLVM_DEBUG(dbgs() << "********** End TLS Variable Hoist **********\n");
+
+  return MadeChange;
+}
+
+void TLSVariableHoistPass::collectTLSCandidate(Instruction *Inst) {
+  // Skip all cast instructions. They are visited indirectly later on.
+  if (Inst->isCast())
+    return;
+
+  // Scan all operands.
+  for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+    auto *GV = dyn_cast<GlobalVariable>(Inst->getOperand(Idx));
+    if (!GV || !GV->isThreadLocal())
+      continue;
+
+    // Add Candidate to TLSCandMap (GV --> Candidate).
+    TLSCandMap[GV].addUser(Inst, Idx);
+  }
+}
+
+void TLSVariableHoistPass::collectTLSCandidates(Function &Fn) {
+  // First, quickly check if there is TLS Variable.
+  Module *M = Fn.getParent();
+
+  bool HasTLS = llvm::any_of(
+      M->globals(), [](GlobalVariable &GV) { return GV.isThreadLocal(); });
+
+  // If there is none, return directly.
+  if (!HasTLS)
+    return;
+
+  TLSCandMap.clear();
+
+  // Then, collect TLS Variable info.
+  for (BasicBlock &BB : Fn) {
+    // Ignore unreachable basic blocks.
+    if (!DT->isReachableFromEntry(&BB))
+      continue;
+
+    for (Instruction &Inst : BB)
+      collectTLSCandidate(&Inst);
+  }
+}
+
+static bool OneUseOutsideLoop(tlshoist::TLSCandidate &Cand, LoopInfo *LI) {
+  if (Cand.Users.size() != 1)
+    return false;
+
+  BasicBlock *BB = Cand.Users[0].Inst->getParent();
+  if (LI && LI->getLoopFor(BB))
+    return false;
+
+  return true;
+}
+
+BasicBlock::iterator
+TLSVariableHoistPass::findInsertPosInEntry(Function &Fn,
+                                           tlshoist::TLSCandidate &Cand) {
+  BasicBlock &Entry = Fn.getEntryBlock();
+
+  // The entry BB is usually small, so quickly check whether the TLS is used in
+  // it. If it is, directly use the first user as the insert position.
+  for (auto &I : Entry) {
+    Instruction *Inst = &I;
+    bool UsedInEntry = llvm::any_of(
+        Cand.Users, [=](tlshoist::TLSUser &User) { return User.Inst == Inst; });
+    if (UsedInEntry)
+      return Inst->getIterator();
+  }
+
+  Instruction *Term = Entry.getTerminator();
+  if (Term)
+    return Term->getIterator();
+
+  // Entry is empty.
+  return Entry.end();
+}
+
+Instruction *TLSVariableHoistPass::getNearestLoopDomInst(BasicBlock *BB) {
+  Loop *L = LI->getLoopFor(BB);
+  assert(L && "Unexcepted Loop status!");
+
+  // Get the outermost loop.
+  while (Loop *Parent = L->getParentLoop())
+    L = Parent;
+
+  BasicBlock *PreHeader = L->getLoopPredecessor();
+
+  // There is a unique predecessor outside the loop.
+  // Note the terminator may be null, because the PreHeader may be an empty BB.
+  if (PreHeader)
+    return PreHeader->getTerminator();
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Dom = Header;
+  for (BasicBlock *PredBB : predecessors(Header))
+    Dom = DT->findNearestCommonDominator(Dom, PredBB);
+
+  assert(Dom && "Not find dominator BB!");
+  Instruction *Term = Dom->getTerminator();
+
+  assert(Term && "Not find terminator instruction!");
+  return Term;
+}
+
+Instruction *TLSVariableHoistPass::getDomInst(Instruction *I1,
+                                              Instruction *I2) {
+  if (!I1)
+    return I2;
+  if (DT->dominates(I1, I2))
+    return I1;
+  if (DT->dominates(I2, I1))
+    return I2;
+
+  // If there is no dominance relation, use common dominator.
+  BasicBlock *DomBB =
+      DT->findNearestCommonDominator(I1->getParent(), I2->getParent());
+
+  Instruction *Dom = DomBB->getTerminator();
+  assert(Dom && "Common dominator not found!");
+
+  return Dom;
+}
+
+BasicBlock::iterator TLSVariableHoistPass::findInsertPos(Function &Fn,
+                                                         GlobalVariable *GV,
+                                                         BasicBlock *&PosBB) {
+  tlshoist::TLSCandidate &Cand = TLSCandMap[GV];
+  if (!DT)
+    return findInsertPosInEntry(Fn, Cand);
+
+  // We should hoist the TLS use out of the loop, so choose the nearest
+  // instruction which dominates the loop and its outer loops (if any).
+  Instruction *LastPos = nullptr;
+  for (auto &User : Cand.Users) {
+    BasicBlock *BB = User.Inst->getParent();
+    Instruction *Pos = User.Inst;
+    if (LI && LI->getLoopFor(BB)) {
+      Pos = getNearestLoopDomInst(BB);
+      // The loop's dominator is an empty BB. That rarely happens, so keep
+      // things simple and insert directly in the entry BB.
+      if (!Pos)
+        return findInsertPosInEntry(Fn, Cand);
+    }
+    Pos = getDomInst(LastPos, Pos);
+    LastPos = Pos;
+  }
+
+  assert(LastPos && "Unexpected insert position!");
+  BasicBlock *Parent = LastPos->getParent();
+  PosBB = Parent;
+  return LastPos->getIterator();
+}
+
+// Generate a bitcast (no type change) to replace the uses of TLS Candidate.
+Instruction *TLSVariableHoistPass::genBitCastInst(Function &Fn,
+                                                  GlobalVariable *GV) {
+  BasicBlock *PosBB = &Fn.getEntryBlock();
+  BasicBlock::iterator Iter = findInsertPos(Fn, GV, PosBB);
+  Type *Ty = GV->getType();
+  auto *CastInst = new BitCastInst(GV, Ty, "tls_bitcast");
+  PosBB->getInstList().insert(Iter, CastInst);
+  return CastInst;
+}
+
+bool TLSVariableHoistPass::tryReplaceTLSCandidate(Function &Fn,
+                                                  GlobalVariable *GV) {
+
+  tlshoist::TLSCandidate &Cand = TLSCandMap[GV];
+
+  // If it is used only once and not in a loop, there is no need to replace it.
+  if (OneUseOutsideLoop(Cand, LI))
+    return false;
+
+  // Generate a bitcast (no type change)
+  auto *CastInst = genBitCastInst(Fn, GV);
+
+  // to replace the uses of TLS Candidate
+  for (auto &User : Cand.Users)
+    User.Inst->setOperand(User.OpndIdx, CastInst);
+
+  return true;
+}
+
+bool TLSVariableHoistPass::tryReplaceTLSCandidates(Function &Fn) {
+  if (TLSCandMap.empty())
+    return false;
+
+  bool Replaced = false;
+  for (auto &GV2Cand : TLSCandMap) {
+    GlobalVariable *GV = GV2Cand.first;
+    Replaced |= tryReplaceTLSCandidate(Fn, GV);
+  }
+
+  return Replaced;
+}
+
+/// Optimize expensive TLS variables in the given function.
+bool TLSVariableHoistPass::runImpl(Function &Fn, DominatorTree &DT,
+                                   LoopInfo &LI) {
+  if (Fn.hasOptNone())
+    return false;
+
+  if (TLSLoadHoist != "optimize" &&
+      !Fn.getAttributes().hasFnAttr("tls-load-hoist"))
+    return false;
+
+  this->LI = &LI;
+  this->DT = &DT;
+  // Collect all TLS variable candidates.
+  collectTLSCandidates(Fn);
+
+  bool MadeChange = tryReplaceTLSCandidates(Fn);
+
+  return MadeChange;
+}
+
+PreservedAnalyses TLSVariableHoistPass::run(Function &F,
+                                            FunctionAnalysisManager &AM) {
+
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+  if (!runImpl(F, DT, LI))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}

diff  --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index d91e3dd4e5dc5..d0898aa5231a9 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -61,6 +61,8 @@
 ; CHECK-NEXT:       Expand vector predication intrinsics
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       TLS Variable Hoist
 ; CHECK-NEXT:     Stack Safety Analysis
 ; CHECK-NEXT:       FunctionPass Manager
 ; CHECK-NEXT:         Dominator Tree Construction

diff  --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 6a281836c0658..fc3ae9b3c8c53 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -214,6 +214,8 @@
 ; GCN-O1-NEXT:      Expand vector predication intrinsics
 ; GCN-O1-NEXT:      Scalarize Masked Memory Intrinsics
 ; GCN-O1-NEXT:      Expand reduction intrinsics
+; GCN-O1-NEXT:      Natural Loop Information
+; GCN-O1-NEXT:      TLS Variable Hoist
 ; GCN-O1-NEXT:    AMDGPU Attributor
 ; GCN-O1-NEXT:    CallGraph Construction
 ; GCN-O1-NEXT:    Call Graph SCC Pass Manager
@@ -484,6 +486,8 @@
 ; GCN-O1-OPTS-NEXT:      Expand vector predication intrinsics
 ; GCN-O1-OPTS-NEXT:      Scalarize Masked Memory Intrinsics
 ; GCN-O1-OPTS-NEXT:      Expand reduction intrinsics
+; GCN-O1-OPTS-NEXT:      Natural Loop Information
+; GCN-O1-OPTS-NEXT:      TLS Variable Hoist
 ; GCN-O1-OPTS-NEXT:      Early CSE
 ; GCN-O1-OPTS-NEXT:    AMDGPU Attributor
 ; GCN-O1-OPTS-NEXT:    CallGraph Construction
@@ -769,6 +773,8 @@
 ; GCN-O2-NEXT:      Expand vector predication intrinsics
 ; GCN-O2-NEXT:      Scalarize Masked Memory Intrinsics
 ; GCN-O2-NEXT:      Expand reduction intrinsics
+; GCN-O2-NEXT:      Natural Loop Information
+; GCN-O2-NEXT:      TLS Variable Hoist
 ; GCN-O2-NEXT:      Early CSE
 ; GCN-O2-NEXT:    AMDGPU Attributor
 ; GCN-O2-NEXT:    CallGraph Construction
@@ -1062,6 +1068,7 @@
 ; GCN-O3-NEXT:      Scalarize Masked Memory Intrinsics
 ; GCN-O3-NEXT:      Expand reduction intrinsics
 ; GCN-O3-NEXT:      Natural Loop Information
+; GCN-O3-NEXT:      TLS Variable Hoist
 ; GCN-O3-NEXT:      Phi Values Analysis
 ; GCN-O3-NEXT:      Basic Alias Analysis (stateless AA impl)
 ; GCN-O3-NEXT:      Function Alias Analysis Results

diff  --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index d25649dd13e5e..10c56a3c495b5 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -41,6 +41,7 @@
 ; CHECK-NEXT:      Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:      Expand reduction intrinsics
 ; CHECK-NEXT:      Natural Loop Information
+; CHECK-NEXT:      TLS Variable Hoist
 ; CHECK-NEXT:      Scalar Evolution Analysis
 ; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:      Function Alias Analysis Results

diff  --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
index 1fa60d76dfbd2..d67fed77d3569 100644
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -65,6 +65,7 @@
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       TLS Variable Hoist
 ; CHECK-NEXT:       CodeGen Prepare
 ; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Exception handling preparation

diff  --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 49a2829833995..e83266f72e488 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -60,6 +60,8 @@
 ; CHECK-NEXT:       Expand vector predication intrinsics
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       TLS Variable Hoist
 ; CHECK-NEXT:       Interleaved Access Pass
 ; CHECK-NEXT:       X86 Partial Reduction
 ; CHECK-NEXT:       Expand indirectbr instructions

diff  --git a/llvm/test/CodeGen/X86/tls-loads-control.ll b/llvm/test/CodeGen/X86/tls-loads-control.ll
new file mode 100644
index 0000000000000..9090cc283cfec
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tls-loads-control.ll
@@ -0,0 +1,248 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-hoist=optimize --stop-after=tlshoist -o - %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --stop-after=tlshoist -o - %s | FileCheck %s
+
+; This test comes from compiling clang/test/CodeGen/intel/tls_loads.cpp with:
+; (clang tls_loads.cpp -fPIC -ftls-model=global-dynamic -O2 -S -emit-llvm)
+
+; // Variable declaration and definition:
+; thread_local int thl_x;
+; thread_local int thl_x2;
+;
+; struct SS {
+;   char thl_c;
+;   int num;
+; };
+;
+; int gfunc();
+; int gfunc2(int);
+
+; // First function (@_Z2f1i):
+; int f1(int c) {
+;   while (c)
+;     c++;
+;
+;   int *px = &thl_x;
+;   c -= gfunc();
+;
+;   while(c++) {
+;     c = gfunc();
+;     while (c--)
+;       *px += gfunc2(thl_x2);
+;   }
+;   return *px;
+; }
+
+$_ZTW5thl_x = comdat any
+
+$_ZTW6thl_x2 = comdat any
+
+@thl_x = thread_local global i32 0, align 4
+@thl_x2 = thread_local global i32 0, align 4
+@_ZZ2f2iE2st.0 = internal thread_local unnamed_addr global i8 0, align 4
+@_ZZ2f2iE2st.1 = internal thread_local unnamed_addr global i32 0, align 4
+
+; Function Attrs: mustprogress uwtable
+define noundef i32 @_Z2f1i(i32 noundef %c) local_unnamed_addr #0 {
+; CHECK-LABEL: _Z2f1i
+; CHECK:      entry:
+; CHECK-NEXT:   %call = tail call noundef i32 @_Z5gfuncv()
+; CHECK-NEXT:   %phi.cmp = icmp eq i32 %call, 0
+; CHECK-NEXT:   %tls_bitcast1 = bitcast i32* @thl_x to i32*
+; CHECK-NEXT:   br i1 %phi.cmp, label %while.end11, label %while.body4.preheader
+
+; CHECK:      while.body4.preheader:
+; CHECK-NEXT:   %tls_bitcast = bitcast i32* @thl_x2 to i32*
+; CHECK-NEXT:   br label %while.body4
+
+; CHECK:      while.body4:
+; CHECK-NEXT:   %call5 = tail call noundef i32 @_Z5gfuncv()
+; CHECK-NEXT:   %tobool7.not18 = icmp eq i32 %call5, 0
+; CHECK-NEXT:   br i1 %tobool7.not18, label %while.body4.backedge, label %while.body8.preheader
+
+; CHECK:      while.body8.preheader:
+; CHECK-NEXT:   br label %while.body8
+
+; CHECK:      while.body4.backedge.loopexit:
+; CHECK-NEXT:   br label %while.body4.backedge
+
+; CHECK:      while.body4.backedge:
+; CHECK-NEXT:   br label %while.body4, !llvm.loop !4
+
+; CHECK:      while.body8:
+; CHECK-NEXT:   %c.addr.219 = phi i32 [ %dec, %while.body8 ], [ %call5, %while.body8.preheader ]
+; CHECK-NEXT:   %dec = add i32 %c.addr.219, -1
+; CHECK-NEXT:   %0 = load i32, i32* %tls_bitcast, align 4
+; CHECK-NEXT:   %call9 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0)
+; CHECK-NEXT:   %1 = load i32, i32* %tls_bitcast1, align 4
+; CHECK-NEXT:   %add = add nsw i32 %1, %call9
+; CHECK-NEXT:   store i32 %add, i32* %tls_bitcast1, align 4
+; CHECK-NEXT:   %tobool7.not = icmp eq i32 %dec, 0
+; CHECK-NEXT:   br i1 %tobool7.not, label %while.body4.backedge.loopexit, label %while.body8, !llvm.loop !4
+
+; CHECK:      while.end11:
+; CHECK-NEXT:   %2 = load i32, i32* %tls_bitcast1, align 4
+; CHECK-NEXT:   ret i32 %2
+
+entry:
+  %call = tail call noundef i32 @_Z5gfuncv()
+  %phi.cmp = icmp eq i32 %call, 0
+  br i1 %phi.cmp, label %while.end11, label %while.body4
+
+while.body4:                                      ; preds = %entry, %while.body4.backedge
+  %call5 = tail call noundef i32 @_Z5gfuncv()
+  %tobool7.not18 = icmp eq i32 %call5, 0
+  br i1 %tobool7.not18, label %while.body4.backedge, label %while.body8
+
+while.body4.backedge:                             ; preds = %while.body8, %while.body4
+  br label %while.body4, !llvm.loop !4
+
+while.body8:                                      ; preds = %while.body4, %while.body8
+  %c.addr.219 = phi i32 [ %dec, %while.body8 ], [ %call5, %while.body4 ]
+  %dec = add nsw i32 %c.addr.219, -1
+  %0 = load i32, i32* @thl_x2, align 4
+  %call9 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0)
+  %1 = load i32, i32* @thl_x, align 4
+  %add = add nsw i32 %1, %call9
+  store i32 %add, i32* @thl_x, align 4
+  %tobool7.not = icmp eq i32 %dec, 0
+  br i1 %tobool7.not, label %while.body4.backedge, label %while.body8, !llvm.loop !4
+
+while.end11:                                      ; preds = %entry
+  %2 = load i32, i32* @thl_x, align 4
+  ret i32 %2
+}
+
+; // Second function (@_Z2f2i):
+; int f2(int c) {
+;   thread_local struct SS st;
+;   c += gfunc();
+;   while (c--) {
+;     thl_x += gfunc();
+;     st.thl_c += (char)gfunc();
+;     st.num += gfunc();
+;   }
+;   return thl_x;
+; }
+declare noundef i32 @_Z5gfuncv() local_unnamed_addr #1
+
+declare noundef i32 @_Z6gfunc2i(i32 noundef) local_unnamed_addr #1
+
+; Function Attrs: mustprogress uwtable
+define noundef i32 @_Z2f2i(i32 noundef %c) local_unnamed_addr #0 {
+; CHECK-LABEL: _Z2f2i
+; CHECK:      entry:
+; CHECK-NEXT:   %call = tail call noundef i32 @_Z5gfuncv()
+; CHECK-NEXT:   %add = add nsw i32 %call, %c
+; CHECK-NEXT:   %tobool.not12 = icmp eq i32 %add, 0
+; CHECK-NEXT:   %tls_bitcast = bitcast i32* @thl_x to i32*
+; CHECK-NEXT:   br i1 %tobool.not12, label %while.end, label %while.body.preheader
+
+; CHECK:      while.body.preheader:
+; CHECK-NEXT:   %tls_bitcast1 = bitcast i8* @_ZZ2f2iE2st.0 to i8*
+; CHECK-NEXT:   %tls_bitcast2 = bitcast i32* @_ZZ2f2iE2st.1 to i32*
+; CHECK-NEXT:   br label %while.body
+
+; CHECK:      while.body:
+; CHECK-NEXT:   %c.addr.013 = phi i32 [ %dec, %while.body ], [ %add, %while.body.preheader ]
+; CHECK-NEXT:   %dec = add i32 %c.addr.013, -1
+; CHECK-NEXT:   %call1 = tail call noundef i32 @_Z5gfuncv()
+; CHECK-NEXT:   %0 = load i32, i32* %tls_bitcast, align 4
+; CHECK-NEXT:   %add2 = add nsw i32 %0, %call1
+; CHECK-NEXT:   store i32 %add2, i32* %tls_bitcast, align 4
+; CHECK-NEXT:   %call3 = tail call noundef i32 @_Z5gfuncv()
+; CHECK-NEXT:   %1 = load i8, i8* %tls_bitcast1, align 4
+; CHECK-NEXT:   %2 = trunc i32 %call3 to i8
+; CHECK-NEXT:   %conv7 = add i8 %1, %2
+; CHECK-NEXT:   store i8 %conv7, i8* %tls_bitcast1, align 4
+; CHECK-NEXT:   %call8 = tail call noundef i32 @_Z5gfuncv()
+; CHECK-NEXT:   %3 = load i32, i32* %tls_bitcast2, align 4
+; CHECK-NEXT:   %add9 = add nsw i32 %3, %call8
+; CHECK-NEXT:   store i32 %add9, i32* %tls_bitcast2, align 4
+; CHECK-NEXT:   %tobool.not = icmp eq i32 %dec, 0
+; CHECK-NEXT:   br i1 %tobool.not, label %while.end.loopexit, label %while.body
+
+; CHECK:      while.end.loopexit:
+; CHECK-NEXT:   br label %while.end
+
+; CHECK:      while.end:
+; CHECK-NEXT:   %4 = load i32, i32* %tls_bitcast, align 4
+; CHECK-NEXT:   ret i32 %4
+entry:
+  %call = tail call noundef i32 @_Z5gfuncv()
+  %add = add nsw i32 %call, %c
+  %tobool.not12 = icmp eq i32 %add, 0
+  br i1 %tobool.not12, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.addr.013 = phi i32 [ %dec, %while.body ], [ %add, %entry ]
+  %dec = add nsw i32 %c.addr.013, -1
+  %call1 = tail call noundef i32 @_Z5gfuncv()
+  %0 = load i32, i32* @thl_x, align 4
+  %add2 = add nsw i32 %0, %call1
+  store i32 %add2, i32* @thl_x, align 4
+  %call3 = tail call noundef i32 @_Z5gfuncv()
+  %1 = load i8, i8* @_ZZ2f2iE2st.0, align 4
+  %2 = trunc i32 %call3 to i8
+  %conv7 = add i8 %1, %2
+  store i8 %conv7, i8* @_ZZ2f2iE2st.0, align 4
+  %call8 = tail call noundef i32 @_Z5gfuncv()
+  %3 = load i32, i32* @_ZZ2f2iE2st.1, align 4
+  %add9 = add nsw i32 %3, %call8
+  store i32 %add9, i32* @_ZZ2f2iE2st.1, align 4
+  %tobool.not = icmp eq i32 %dec, 0
+  br i1 %tobool.not, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %4 = load i32, i32* @thl_x, align 4
+  ret i32 %4
+}
+
+; // Third function (@_Z2f3i):
+; int f3(int c) {
+;   int *px = &thl_x;
+;   gfunc2(*px);
+;   gfunc2(*px);
+;   return 1;
+; }
+
+; Function Attrs: mustprogress uwtable
+define noundef i32 @_Z2f3i(i32 noundef %c) local_unnamed_addr #0 {
+; CHECK-LABEL: _Z2f3i
+; CHECK:      entry:
+; CHECK-NEXT:   %tls_bitcast = bitcast i32* @thl_x to i32*
+; CHECK-NEXT:   %0 = load i32, i32* %tls_bitcast, align 4
+; CHECK-NEXT:   %call = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0)
+; CHECK-NEXT:   %1 = load i32, i32* %tls_bitcast, align 4
+; CHECK-NEXT:   %call1 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %1)
+; CHECK-NEXT:   ret i32 1
+entry:
+  %0 = load i32, i32* @thl_x, align 4
+  %call = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0)
+  %1 = load i32, i32* @thl_x, align 4
+  %call1 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %1)
+  ret i32 1
+}
+
+; Function Attrs: uwtable
+define weak_odr hidden noundef i32* @_ZTW5thl_x() local_unnamed_addr #2 comdat {
+  ret i32* @thl_x
+}
+
+; Function Attrs: uwtable
+define weak_odr hidden noundef i32* @_ZTW6thl_x2() local_unnamed_addr #2 comdat {
+  ret i32* @thl_x2
+}
+
+attributes #0 = { mustprogress uwtable "tls-load-hoist" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{i32 7, !"uwtable", i32 2}
+!3 = !{!"clang version 15.0.0"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}

diff  --git a/llvm/test/CodeGen/X86/tls-loads-control2.ll b/llvm/test/CodeGen/X86/tls-loads-control2.ll
new file mode 100644
index 0000000000000..9124eb737c733
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tls-loads-control2.ll
@@ -0,0 +1,51 @@
+; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic --tls-load-hoist=optimize -o - %s | FileCheck %s --check-prefix=HOIST0
+; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic --tls-load-hoist=non-optimize -o - %s | FileCheck %s --check-prefix=HOIST2
+; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic -o - %s | FileCheck %s --check-prefix=HOIST2
+
+$_ZTW5thl_x = comdat any
+
+@thl_x = thread_local global i32 0, align 4
+
+; Function Attrs: mustprogress uwtable
+define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 {
+entry:
+  %0 = load i32, i32* @thl_x, align 4
+  %call = tail call i32 @_Z5gfunci(i32 %0)
+  %1 = load i32, i32* @thl_x, align 4
+  %call1 = tail call i32 @_Z5gfunci(i32 %1)
+  ret i32 1
+}
+
+;HOIST0-LABEL: _Z2f1i
+;HOIST0:     entry:
+;HOIST0-NEXT:  %tls_bitcast = bitcast i32* @thl_x to i32*
+;HOIST0-NEXT:  %0 = load i32, i32* %tls_bitcast, align 4
+;HOIST0-NEXT:  %call = tail call i32 @_Z5gfunci(i32 %0)
+;HOIST0-NEXT:  %1 = load i32, i32* %tls_bitcast, align 4
+;HOIST0-NEXT:  %call1 = tail call i32 @_Z5gfunci(i32 %1)
+;HOIST0-NEXT:  ret i32 1
+
+;HOIST2-LABEL: _Z2f1i
+;HOIST2:     entry:
+;HOIST2-NEXT:  %0 = load i32, i32* @thl_x, align 4
+;HOIST2-NEXT:  %call = tail call i32 @_Z5gfunci(i32 %0)
+;HOIST2-NEXT:  %1 = load i32, i32* @thl_x, align 4
+;HOIST2-NEXT:  %call1 = tail call i32 @_Z5gfunci(i32 %1)
+;HOIST2-NEXT:  ret i32 1
+
+declare i32 @_Z5gfunci(i32) local_unnamed_addr #1
+
+; Function Attrs: uwtable
+define weak_odr hidden i32* @_ZTW5thl_x() local_unnamed_addr #2 comdat {
+  ret i32* @thl_x
+}
+
+attributes #0 = { mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{i32 7, !"uwtable", i32 1}

diff  --git a/llvm/test/CodeGen/X86/tls-loads-control3.ll b/llvm/test/CodeGen/X86/tls-loads-control3.ll
new file mode 100644
index 0000000000000..4bf2b566a34ba
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tls-loads-control3.ll
@@ -0,0 +1,358 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-hoist=optimize -o - %s | FileCheck %s --check-prefix=HOIST0
+; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-hoist=non-optimize -o - %s | FileCheck %s --check-prefix=HOIST2
+; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic -o - %s | FileCheck %s --check-prefix=HOIST2
+
+; This test has no module flag {"tls-load-hoist", i32 0}, so --tls-load-hoist=x
+; is used to choose how thread_local addresses are loaded.
+
+; This test comes from compiling clang/test/CodeGen/intel/tls_loads.cpp with:
+; (clang tls_loads.cpp -fPIC -ftls-model=global-dynamic -O2 -S -emit-llvm)
+
+$_ZTW5thl_x = comdat any
+
+$_ZTW6thl_x2 = comdat any
+
+@thl_x = thread_local global i32 0, align 4
+@thl_x2 = thread_local global i32 0, align 4
+@_ZZ2f2iE2st.0 = internal thread_local unnamed_addr global i8 0, align 4
+@_ZZ2f2iE2st.1 = internal thread_local unnamed_addr global i32 0, align 4
+
+; For HOIST0, check that __tls_get_addr@PLT is called only once for each thread_local variable.
+; For HOIST2, check the default behavior: __tls_get_addr@PLT is usually called every time a thread_local variable is used.
+
+; Function Attrs: mustprogress uwtable
+define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 {
+; HOIST0-LABEL: _Z2f1i:
+; HOIST0:       # %bb.0: # %entry
+; HOIST0-NEXT:    pushq %r15
+; HOIST0-NEXT:    .cfi_def_cfa_offset 16
+; HOIST0-NEXT:    pushq %r14
+; HOIST0-NEXT:    .cfi_def_cfa_offset 24
+; HOIST0-NEXT:    pushq %rbx
+; HOIST0-NEXT:    .cfi_def_cfa_offset 32
+; HOIST0-NEXT:    .cfi_offset %rbx, -32
+; HOIST0-NEXT:    .cfi_offset %r14, -24
+; HOIST0-NEXT:    .cfi_offset %r15, -16
+; HOIST0-NEXT:    movl %edi, %ebx
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    leaq thl_x at TLSGD(%rip), %rdi
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    rex64
+; HOIST0-NEXT:    callq __tls_get_addr at PLT
+; HOIST0-NEXT:    movq %rax, %r14
+; HOIST0-NEXT:    testl %ebx, %ebx
+; HOIST0-NEXT:    je .LBB0_4
+; HOIST0-NEXT:  # %bb.1: # %while.body.preheader
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    leaq thl_x2 at TLSGD(%rip), %rdi
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    rex64
+; HOIST0-NEXT:    callq __tls_get_addr at PLT
+; HOIST0-NEXT:    movq %rax, %r15
+; HOIST0-NEXT:    .p2align 4, 0x90
+; HOIST0-NEXT:  .LBB0_2: # %while.body
+; HOIST0-NEXT:    # =>This Inner Loop Header: Depth=1
+; HOIST0-NEXT:    movl (%r15), %edi
+; HOIST0-NEXT:    callq _Z6gfunc2i at PLT
+; HOIST0-NEXT:    addl (%r14), %eax
+; HOIST0-NEXT:    movl %eax, (%r14)
+; HOIST0-NEXT:    decl %ebx
+; HOIST0-NEXT:    jne .LBB0_2
+; HOIST0-NEXT:    jmp .LBB0_3
+; HOIST0-NEXT:  .LBB0_4: # %entry.while.end_crit_edge
+; HOIST0-NEXT:    movl (%r14), %eax
+; HOIST0-NEXT:  .LBB0_3: # %while.end
+; HOIST0-NEXT:    popq %rbx
+; HOIST0-NEXT:    .cfi_def_cfa_offset 24
+; HOIST0-NEXT:    popq %r14
+; HOIST0-NEXT:    .cfi_def_cfa_offset 16
+; HOIST0-NEXT:    popq %r15
+; HOIST0-NEXT:    .cfi_def_cfa_offset 8
+; HOIST0-NEXT:    retq
+;
+; HOIST2-LABEL: _Z2f1i:
+; HOIST2:       # %bb.0: # %entry
+; HOIST2-NEXT:    pushq %rbp
+; HOIST2-NEXT:    .cfi_def_cfa_offset 16
+; HOIST2-NEXT:    pushq %rbx
+; HOIST2-NEXT:    .cfi_def_cfa_offset 24
+; HOIST2-NEXT:    pushq %rax
+; HOIST2-NEXT:    .cfi_def_cfa_offset 32
+; HOIST2-NEXT:    .cfi_offset %rbx, -24
+; HOIST2-NEXT:    .cfi_offset %rbp, -16
+; HOIST2-NEXT:    testl %edi, %edi
+; HOIST2-NEXT:    je .LBB0_4
+; HOIST2-NEXT:  # %bb.1:
+; HOIST2-NEXT:    movl %edi, %ebx
+; HOIST2-NEXT:    .p2align 4, 0x90
+; HOIST2-NEXT:  .LBB0_2: # %while.body
+; HOIST2-NEXT:    # =>This Inner Loop Header: Depth=1
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    leaq thl_x2 at TLSGD(%rip), %rdi
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    rex64
+; HOIST2-NEXT:    callq __tls_get_addr at PLT
+; HOIST2-NEXT:    movl (%rax), %edi
+; HOIST2-NEXT:    callq _Z6gfunc2i at PLT
+; HOIST2-NEXT:    movl %eax, %ebp
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    leaq thl_x at TLSGD(%rip), %rdi
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    rex64
+; HOIST2-NEXT:    callq __tls_get_addr at PLT
+; HOIST2-NEXT:    addl (%rax), %ebp
+; HOIST2-NEXT:    movl %ebp, (%rax)
+; HOIST2-NEXT:    decl %ebx
+; HOIST2-NEXT:    jne .LBB0_2
+; HOIST2-NEXT:    jmp .LBB0_3
+; HOIST2-NEXT:  .LBB0_4: # %entry.while.end_crit_edge
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    leaq thl_x at TLSGD(%rip), %rdi
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    rex64
+; HOIST2-NEXT:    callq __tls_get_addr at PLT
+; HOIST2-NEXT:    movl (%rax), %ebp
+; HOIST2-NEXT:  .LBB0_3: # %while.end
+; HOIST2-NEXT:    movl %ebp, %eax
+; HOIST2-NEXT:    addq $8, %rsp
+; HOIST2-NEXT:    .cfi_def_cfa_offset 24
+; HOIST2-NEXT:    popq %rbx
+; HOIST2-NEXT:    .cfi_def_cfa_offset 16
+; HOIST2-NEXT:    popq %rbp
+; HOIST2-NEXT:    .cfi_def_cfa_offset 8
+; HOIST2-NEXT:    retq
+entry:
+  %tobool.not3 = icmp eq i32 %c, 0
+  br i1 %tobool.not3, label %entry.while.end_crit_edge, label %while.body
+
+entry.while.end_crit_edge:                        ; preds = %entry
+  %.pre = load i32, i32* @thl_x, align 4
+  br label %while.end
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.addr.04 = phi i32 [ %dec, %while.body ], [ %c, %entry ]
+  %dec = add nsw i32 %c.addr.04, -1
+  %0 = load i32, i32* @thl_x2, align 4
+  %call = tail call i32 @_Z6gfunc2i(i32 %0)
+  %1 = load i32, i32* @thl_x, align 4
+  %add = add nsw i32 %1, %call
+  store i32 %add, i32* @thl_x, align 4
+  %tobool.not = icmp eq i32 %dec, 0
+  br i1 %tobool.not, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry.while.end_crit_edge
+  %2 = phi i32 [ %.pre, %entry.while.end_crit_edge ], [ %add, %while.body ]
+  ret i32 %2
+}
+
+declare i32 @_Z6gfunc2i(i32) local_unnamed_addr #1
+
+; Function Attrs: mustprogress uwtable
+define i32 @_Z2f2i(i32 %c) local_unnamed_addr #0 {
+; HOIST0-LABEL: _Z2f2i:
+; HOIST0:       # %bb.0: # %entry
+; HOIST0-NEXT:    pushq %r15
+; HOIST0-NEXT:    .cfi_def_cfa_offset 16
+; HOIST0-NEXT:    pushq %r14
+; HOIST0-NEXT:    .cfi_def_cfa_offset 24
+; HOIST0-NEXT:    pushq %r12
+; HOIST0-NEXT:    .cfi_def_cfa_offset 32
+; HOIST0-NEXT:    pushq %rbx
+; HOIST0-NEXT:    .cfi_def_cfa_offset 40
+; HOIST0-NEXT:    pushq %rax
+; HOIST0-NEXT:    .cfi_def_cfa_offset 48
+; HOIST0-NEXT:    .cfi_offset %rbx, -40
+; HOIST0-NEXT:    .cfi_offset %r12, -32
+; HOIST0-NEXT:    .cfi_offset %r14, -24
+; HOIST0-NEXT:    .cfi_offset %r15, -16
+; HOIST0-NEXT:    movl %edi, %ebx
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    leaq thl_x at TLSGD(%rip), %rdi
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    rex64
+; HOIST0-NEXT:    callq __tls_get_addr at PLT
+; HOIST0-NEXT:    movq %rax, %r14
+; HOIST0-NEXT:    testl %ebx, %ebx
+; HOIST0-NEXT:    je .LBB1_3
+; HOIST0-NEXT:  # %bb.1: # %while.body.preheader
+; HOIST0-NEXT:    leaq _ZZ2f2iE2st.0 at TLSLD(%rip), %rdi
+; HOIST0-NEXT:    callq __tls_get_addr at PLT
+; HOIST0-NEXT:    movq %rax, %rcx
+; HOIST0-NEXT:    leaq _ZZ2f2iE2st.0 at DTPOFF(%rax), %r15
+; HOIST0-NEXT:    leaq _ZZ2f2iE2st.1 at DTPOFF(%rax), %r12
+; HOIST0-NEXT:    .p2align 4, 0x90
+; HOIST0-NEXT:  .LBB1_2: # %while.body
+; HOIST0-NEXT:    # =>This Inner Loop Header: Depth=1
+; HOIST0-NEXT:    callq _Z5gfuncv at PLT
+; HOIST0-NEXT:    addl %eax, (%r14)
+; HOIST0-NEXT:    callq _Z5gfuncv at PLT
+; HOIST0-NEXT:    addb %al, (%r15)
+; HOIST0-NEXT:    callq _Z5gfuncv at PLT
+; HOIST0-NEXT:    addl %eax, (%r12)
+; HOIST0-NEXT:    decl %ebx
+; HOIST0-NEXT:    jne .LBB1_2
+; HOIST0-NEXT:  .LBB1_3: # %while.end
+; HOIST0-NEXT:    movl (%r14), %eax
+; HOIST0-NEXT:    addq $8, %rsp
+; HOIST0-NEXT:    .cfi_def_cfa_offset 40
+; HOIST0-NEXT:    popq %rbx
+; HOIST0-NEXT:    .cfi_def_cfa_offset 32
+; HOIST0-NEXT:    popq %r12
+; HOIST0-NEXT:    .cfi_def_cfa_offset 24
+; HOIST0-NEXT:    popq %r14
+; HOIST0-NEXT:    .cfi_def_cfa_offset 16
+; HOIST0-NEXT:    popq %r15
+; HOIST0-NEXT:    .cfi_def_cfa_offset 8
+; HOIST0-NEXT:    retq
+;
+; HOIST2-LABEL: _Z2f2i:
+; HOIST2:       # %bb.0: # %entry
+; HOIST2-NEXT:    pushq %rbp
+; HOIST2-NEXT:    .cfi_def_cfa_offset 16
+; HOIST2-NEXT:    pushq %r14
+; HOIST2-NEXT:    .cfi_def_cfa_offset 24
+; HOIST2-NEXT:    pushq %rbx
+; HOIST2-NEXT:    .cfi_def_cfa_offset 32
+; HOIST2-NEXT:    .cfi_offset %rbx, -32
+; HOIST2-NEXT:    .cfi_offset %r14, -24
+; HOIST2-NEXT:    .cfi_offset %rbp, -16
+; HOIST2-NEXT:    testl %edi, %edi
+; HOIST2-NEXT:    je .LBB1_3
+; HOIST2-NEXT:  # %bb.1: # %while.body.preheader
+; HOIST2-NEXT:    movl %edi, %ebx
+; HOIST2-NEXT:    .p2align 4, 0x90
+; HOIST2-NEXT:  .LBB1_2: # %while.body
+; HOIST2-NEXT:    # =>This Inner Loop Header: Depth=1
+; HOIST2-NEXT:    callq _Z5gfuncv at PLT
+; HOIST2-NEXT:    movl %eax, %ebp
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    leaq thl_x at TLSGD(%rip), %rdi
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    rex64
+; HOIST2-NEXT:    callq __tls_get_addr at PLT
+; HOIST2-NEXT:    addl %ebp, (%rax)
+; HOIST2-NEXT:    callq _Z5gfuncv at PLT
+; HOIST2-NEXT:    movl %eax, %ebp
+; HOIST2-NEXT:    leaq _ZZ2f2iE2st.0 at TLSLD(%rip), %rdi
+; HOIST2-NEXT:    callq __tls_get_addr at PLT
+; HOIST2-NEXT:    movq %rax, %r14
+; HOIST2-NEXT:    addb %bpl, _ZZ2f2iE2st.0 at DTPOFF(%rax)
+; HOIST2-NEXT:    callq _Z5gfuncv at PLT
+; HOIST2-NEXT:    movl %eax, %ecx
+; HOIST2-NEXT:    movq %r14, %rax
+; HOIST2-NEXT:    addl %ecx, _ZZ2f2iE2st.1 at DTPOFF(%r14)
+; HOIST2-NEXT:    decl %ebx
+; HOIST2-NEXT:    jne .LBB1_2
+; HOIST2-NEXT:  .LBB1_3: # %while.end
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    leaq thl_x at TLSGD(%rip), %rdi
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    rex64
+; HOIST2-NEXT:    callq __tls_get_addr at PLT
+; HOIST2-NEXT:    movl (%rax), %eax
+; HOIST2-NEXT:    popq %rbx
+; HOIST2-NEXT:    .cfi_def_cfa_offset 24
+; HOIST2-NEXT:    popq %r14
+; HOIST2-NEXT:    .cfi_def_cfa_offset 16
+; HOIST2-NEXT:    popq %rbp
+; HOIST2-NEXT:    .cfi_def_cfa_offset 8
+; HOIST2-NEXT:    retq
+entry:
+  %tobool.not9 = icmp eq i32 %c, 0
+  br i1 %tobool.not9, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.addr.010 = phi i32 [ %dec, %while.body ], [ %c, %entry ]
+  %dec = add nsw i32 %c.addr.010, -1
+  %call = tail call i32 @_Z5gfuncv()
+  %0 = load i32, i32* @thl_x, align 4
+  %add = add nsw i32 %0, %call
+  store i32 %add, i32* @thl_x, align 4
+  %call1 = tail call i32 @_Z5gfuncv()
+  %1 = load i8, i8* @_ZZ2f2iE2st.0, align 4
+  %2 = trunc i32 %call1 to i8
+  %conv5 = add i8 %1, %2
+  store i8 %conv5, i8* @_ZZ2f2iE2st.0, align 4
+  %call6 = tail call i32 @_Z5gfuncv()
+  %3 = load i32, i32* @_ZZ2f2iE2st.1, align 4
+  %add7 = add nsw i32 %3, %call6
+  store i32 %add7, i32* @_ZZ2f2iE2st.1, align 4
+  %tobool.not = icmp eq i32 %dec, 0
+  br i1 %tobool.not, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %4 = load i32, i32* @thl_x, align 4
+  ret i32 %4
+}
+
+declare i32 @_Z5gfuncv() local_unnamed_addr #1
+
+; Function Attrs: mustprogress uwtable
+define i32 @_Z2f3i(i32 %c) local_unnamed_addr #0 {
+; HOIST0-LABEL: _Z2f3i:
+; HOIST0:       # %bb.0: # %entry
+; HOIST0-NEXT:    pushq %rbx
+; HOIST0-NEXT:    .cfi_def_cfa_offset 16
+; HOIST0-NEXT:    .cfi_offset %rbx, -16
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    leaq thl_x at TLSGD(%rip), %rdi
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    data16
+; HOIST0-NEXT:    rex64
+; HOIST0-NEXT:    callq __tls_get_addr at PLT
+; HOIST0-NEXT:    movq %rax, %rbx
+; HOIST0-NEXT:    movl (%rax), %edi
+; HOIST0-NEXT:    callq _Z6gfunc2i at PLT
+; HOIST0-NEXT:    movl (%rbx), %edi
+; HOIST0-NEXT:    callq _Z6gfunc2i at PLT
+; HOIST0-NEXT:    movl $1, %eax
+; HOIST0-NEXT:    popq %rbx
+; HOIST0-NEXT:    .cfi_def_cfa_offset 8
+; HOIST0-NEXT:    retq
+;
+; HOIST2-LABEL: _Z2f3i:
+; HOIST2:       # %bb.0: # %entry
+; HOIST2-NEXT:    pushq %rbx
+; HOIST2-NEXT:    .cfi_def_cfa_offset 16
+; HOIST2-NEXT:    .cfi_offset %rbx, -16
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    leaq thl_x at TLSGD(%rip), %rdi
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    data16
+; HOIST2-NEXT:    rex64
+; HOIST2-NEXT:    callq __tls_get_addr at PLT
+; HOIST2-NEXT:    movq %rax, %rbx
+; HOIST2-NEXT:    movl (%rax), %edi
+; HOIST2-NEXT:    callq _Z6gfunc2i at PLT
+; HOIST2-NEXT:    movl (%rbx), %edi
+; HOIST2-NEXT:    callq _Z6gfunc2i at PLT
+; HOIST2-NEXT:    movl $1, %eax
+; HOIST2-NEXT:    popq %rbx
+; HOIST2-NEXT:    .cfi_def_cfa_offset 8
+; HOIST2-NEXT:    retq
+entry:
+  %0 = load i32, i32* @thl_x, align 4
+  %call = tail call i32 @_Z6gfunc2i(i32 %0)
+  %1 = load i32, i32* @thl_x, align 4
+  %call1 = tail call i32 @_Z6gfunc2i(i32 %1)
+  ret i32 1
+}
+
+attributes #0 = { nounwind mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{i32 7, !"uwtable", i32 1}

diff  --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
index c07f4e66486c8..25efb45450991 100644
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -369,6 +369,7 @@ int main(int argc, char **argv) {
   initializeHardwareLoopsPass(*Registry);
   initializeTransformUtils(*Registry);
   initializeReplaceWithVeclibLegacyPass(*Registry);
+  initializeTLSVariableHoistLegacyPassPass(*Registry);
 
   // Initialize debugging passes.
   initializeScavengerTestPass(*Registry);


        


More information about the llvm-commits mailing list