[clang] bacdf80 - Use @llvm.threadlocal.address intrinsic to access TLS variable

Chuanqi Xu via cfe-commits cfe-commits at lists.llvm.org
Sun Jul 31 20:05:25 PDT 2022


Author: Chuanqi Xu
Date: 2022-08-01T11:05:00+08:00
New Revision: bacdf80f42b46044262e97e98398d1bd0b75900d

URL: https://github.com/llvm/llvm-project/commit/bacdf80f42b46044262e97e98398d1bd0b75900d
DIFF: https://github.com/llvm/llvm-project/commit/bacdf80f42b46044262e97e98398d1bd0b75900d.diff

LOG: Use @llvm.threadlocal.address intrinsic to access TLS variable

This is successor for D125291. This revision would try to use
@llvm.threadlocal.address in clang to access TLS variable. The reason
why the OpenMP tests contains a lot of change is that they uses
utils/update_cc_test_checks.py to update their tests.

Reviewed By: nikic

Differential Revision: https://reviews.llvm.org/D129833

Added: 
    clang/test/CodeGenCXX/threadlocal_address.cpp

Modified: 
    clang/lib/CodeGen/CGExpr.cpp
    clang/lib/CodeGen/ItaniumCXXABI.cpp
    clang/test/CodeGenCXX/cxx11-thread-local-instantiated.cpp
    clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
    clang/test/CodeGenCXX/cxx11-thread-local.cpp
    clang/test/CodeGenCXX/cxx1y-variable-template.cpp
    clang/test/CodeGenCXX/cxx2a-thread-local-constinit.cpp
    clang/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp
    clang/test/CodeGenCXX/pr18635.cpp
    clang/test/Modules/initializers.cpp
    clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp
    clang/test/OpenMP/for_reduction_task_codegen.cpp
    clang/test/OpenMP/parallel_copyin_codegen.cpp
    clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp
    clang/test/OpenMP/parallel_master_codegen.cpp
    clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
    clang/test/OpenMP/parallel_reduction_task_codegen.cpp
    clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp
    clang/test/OpenMP/reduction_implicit_map.cpp
    clang/test/OpenMP/sections_reduction_task_codegen.cpp
    clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp
    clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
    clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
    clang/test/OpenMP/taskloop_reduction_codegen.cpp
    clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp
    clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp
    clang/test/OpenMP/threadprivate_codegen.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 0b8c200335a04..91a6b4e528472 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1729,6 +1729,10 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile,
                                                LValueBaseInfo BaseInfo,
                                                TBAAAccessInfo TBAAInfo,
                                                bool isNontemporal) {
+  if (auto *GV = dyn_cast<llvm::GlobalValue>(Addr.getPointer()))
+    if (GV->isThreadLocal())
+      Addr = Addr.withPointer(Builder.CreateThreadLocalAddress(GV));
+
   if (const auto *ClangVecTy = Ty->getAs<VectorType>()) {
     // Boolean vectors use `iN` as storage type.
     if (ClangVecTy->isExtVectorBoolType()) {
@@ -1870,6 +1874,10 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr,
                                         LValueBaseInfo BaseInfo,
                                         TBAAAccessInfo TBAAInfo,
                                         bool isInit, bool isNontemporal) {
+  if (auto *GV = dyn_cast<llvm::GlobalValue>(Addr.getPointer()))
+    if (GV->isThreadLocal())
+      Addr = Addr.withPointer(Builder.CreateThreadLocalAddress(GV));
+
   llvm::Type *SrcTy = Value->getType();
   if (const auto *ClangVecTy = Ty->getAs<VectorType>()) {
     auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy);
@@ -2600,6 +2608,10 @@ static LValue EmitGlobalVarDeclLValue(CodeGenFunction &CGF,
   }
 
   llvm::Value *V = CGF.CGM.GetAddrOfGlobalVar(VD);
+
+  if (VD->getTLSKind() != VarDecl::TLS_None)
+    V = CGF.Builder.CreateThreadLocalAddress(V);
+
   llvm::Type *RealVarTy = CGF.getTypes().ConvertTypeForMem(VD->getType());
   V = EmitBitCastOfLValueToProperType(CGF, V, RealVarTy);
   CharUnits Alignment = CGF.getContext().getDeclAlign(VD);
@@ -2869,6 +2881,10 @@ LValue CodeGenFunction::EmitDeclRefLValue(const DeclRefExpr *E) {
       llvm_unreachable("DeclRefExpr for Decl not entered in LocalDeclMap?");
     }
 
+    // Handle threadlocal function locals.
+    if (VD->getTLSKind() != VarDecl::TLS_None)
+      addr =
+          addr.withPointer(Builder.CreateThreadLocalAddress(addr.getPointer()));
 
     // Check for OpenMP threadprivate variables.
     if (getLangOpts().OpenMP && !getLangOpts().OpenMPSimd &&

diff  --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index fc2ff15a6acdc..ef484927f9b9f 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -2988,14 +2988,16 @@ void ItaniumCXXABI::EmitThreadLocalInitFuncs(
 
     // For a reference, the result of the wrapper function is a pointer to
     // the referenced object.
-    llvm::Value *Val = Var;
+    llvm::Value *Val = Builder.CreateThreadLocalAddress(Var);
+
     if (VD->getType()->isReferenceType()) {
       CharUnits Align = CGM.getContext().getDeclAlign(VD);
-      Val = Builder.CreateAlignedLoad(Var->getValueType(), Var, Align);
+      Val = Builder.CreateAlignedLoad(Var->getValueType(), Val, Align);
     }
     if (Val->getType() != Wrapper->getReturnType())
       Val = Builder.CreatePointerBitCastOrAddrSpaceCast(
           Val, Wrapper->getReturnType(), "");
+
     Builder.CreateRet(Val);
   }
 }

diff  --git a/clang/test/CodeGenCXX/cxx11-thread-local-instantiated.cpp b/clang/test/CodeGenCXX/cxx11-thread-local-instantiated.cpp
index 3c78c7f7df902..46c16bdd94e39 100644
--- a/clang/test/CodeGenCXX/cxx11-thread-local-instantiated.cpp
+++ b/clang/test/CodeGenCXX/cxx11-thread-local-instantiated.cpp
@@ -17,7 +17,8 @@ S *current() { return TLS<S>::mData; };
 
 // CHECK-LABEL: define weak_odr hidden {{.*}} @_ZTWN3TLSI1SE5mDataE() {{.*}} comdat {
 // CHECK: call void @_ZTHN3TLSI1SE5mDataE()
-// CHECK: ret {{.*}} @_ZN3TLSI1SE5mDataE
+// CHECK: [[TLSmData_ADDR:%[^ ]+]] = call ptr @llvm.threadlocal.address.p0(ptr @_ZN3TLSI1SE5mDataE)
+// CHECK: ret {{.*}} [[TLSmData_ADDR]]
 
 // Unlike for a global, the global initialization function must not be in a
 // COMDAT with the variable, because it is referenced from the _ZTH function

diff  --git a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
index 07245ef131baa..3fc960f1eefc4 100644
--- a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
+++ b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
@@ -15,7 +15,8 @@ int &g() { return r; }
 
 // CHECK: define {{.*}} @[[R_INIT:.*]]()
 // CHECK: call noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) i32* @_Z1fv()
-// CHECK: store i32* %{{.*}}, i32** @r, align 8
+// CHECK: %[[R_ADDR:.+]] = call i32** @llvm.threadlocal.address.p0p0i32(i32** @r)
+// CHECK: store i32* %{{.*}}, i32** %[[R_ADDR]], align 8
 
 // CHECK-LABEL: define{{.*}} nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) i32* @_Z1gv()
 // LINUX_AIX: call i32* @_ZTW1r()
@@ -26,7 +27,8 @@ int &g() { return r; }
 // DARWIN: define cxx_fast_tlscc noundef i32* @_ZTW1r() [[ATTR1:#[0-9]+]] {
 // LINUX_AIX: call void @_ZTH1r()
 // DARWIN: call cxx_fast_tlscc void @_ZTH1r()
-// CHECK: load i32*, i32** @r, align 8
+// CHECK: %[[R_ADDR2:.+]] = call i32** @llvm.threadlocal.address.p0p0i32(i32** @r)
+// CHECK: load i32*, i32** %[[R_ADDR2]], align 8
 // CHECK: ret i32* %{{.*}}
 
 // LINUX_AIX-LABEL: define internal void @__tls_init()

diff  --git a/clang/test/CodeGenCXX/cxx11-thread-local.cpp b/clang/test/CodeGenCXX/cxx11-thread-local.cpp
index a570296dfc2a8..4023daa8192ab 100644
--- a/clang/test/CodeGenCXX/cxx11-thread-local.cpp
+++ b/clang/test/CodeGenCXX/cxx11-thread-local.cpp
@@ -108,7 +108,8 @@ void *e2 = V<char>::m + W<char>::m + &X<char>::m;
 
 // CHECK: define {{.*}} @[[A_INIT:.*]]()
 // CHECK: call{{.*}} i32 @_Z1fv()
-// CHECK-NEXT: store i32 {{.*}}, i32* @a, align 4
+// CHECK: [[A_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @a)
+// CHECK-NEXT: store i32 {{.*}}, i32* [[A_ADDR]], align 4
 
 // CHECK-LABEL: define{{.*}} i32 @_Z1fv()
 int f() {
@@ -117,12 +118,14 @@ int f() {
   // CHECK: br i1 %[[NEED_INIT]]{{.*}}
 
   // CHECK: %[[CALL:.*]] = call{{.*}} i32 @_Z1gv()
-  // CHECK: store i32 %[[CALL]], i32* @_ZZ1fvE1n, align 4
+  // CHECK: [[N_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ1fvE1n)
+  // CHECK: store i32 %[[CALL]], i32* [[N_ADDR]], align 4
   // CHECK: store i8 1, i8* @_ZGVZ1fvE1n
   // CHECK: br label
   static thread_local int n = g();
-
-  // CHECK: load i32, i32* @_ZZ1fvE1n, align 4
+  
+  // CHECK: [[N_ADDR2:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ1fvE1n)
+  // CHECK: load i32, i32* [[N_ADDR2]], align 4
   return n;
 }
 
@@ -140,17 +143,20 @@ int f() {
 // LINUX: br label
 // AIX-NOT: br label
 // finally:
-// LINUX_AIX: ret i32* @b
+// LINUX_AIX: [[B_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @b)
+// LINUX_AIX: ret i32* [[B_ADDR]]
 // DARWIN-LABEL: declare cxx_fast_tlscc noundef i32* @_ZTW1b()
 // There is no definition of the thread wrapper on Darwin for external TLV.
 
 // CHECK: define {{.*}} @[[D_INIT:.*]]()
 // CHECK: call{{.*}} i32 @_Z1gv()
-// CHECK-NEXT: store i32 %{{.*}}, i32* @_ZL1d, align 4
+// CHECK-NEXT: [[D_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZL1d)
+// CHECK-NEXT: store i32 %{{.*}}, i32* [[D_ADDR]], align 4
 
 // CHECK: define {{.*}} @[[U_M_INIT:.*]]()
 // CHECK: call{{.*}} i32 @_Z1fv()
-// CHECK-NEXT: store i32 %{{.*}}, i32* @_ZN1U1mE, align 4
+// CHECK-NEXT: [[UM_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN1U1mE)
+// CHECK-NEXT: store i32 %{{.*}}, i32* [[UM_ADDR]], align 4
 
 // CHECK: define {{.*}} @[[E_INIT:.*]]()
 // LINUX_AIX: call i32* @_ZTWN1VIiE1mE()
@@ -164,18 +170,21 @@ int f() {
 // DARWIN-LABEL: define weak_odr hidden cxx_fast_tlscc noundef i32* @_ZTWN1VIiE1mE()
 // LINUX_AIX: call void @_ZTHN1VIiE1mE()
 // DARWIN: call cxx_fast_tlscc void @_ZTHN1VIiE1mE()
-// CHECK: ret i32* @_ZN1VIiE1mE
+// CHECK: [[VM_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN1VIiE1mE)
+// CHECK: ret i32* [[VM_ADDR]]
 
 // LINUX_AIX-LABEL: define weak_odr hidden noundef i32* @_ZTWN1WIiE1mE()
 // DARWIN-LABEL: define weak_odr hidden cxx_fast_tlscc noundef i32* @_ZTWN1WIiE1mE()
 // CHECK-NOT: call
-// CHECK: ret i32* @_ZN1WIiE1mE
+// CHECK: [[WM_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN1WIiE1mE)
+// CHECK: ret i32* [[WM_ADDR]]
 
 // LINUX_AIX-LABEL: define weak_odr hidden {{.*}}* @_ZTWN1XIiE1mE()
 // DARWIN-LABEL: define weak_odr hidden cxx_fast_tlscc {{.*}}* @_ZTWN1XIiE1mE()
 // LINUX_AIX: call void @_ZTHN1XIiE1mE()
 // DARWIN: call cxx_fast_tlscc void @_ZTHN1XIiE1mE()
-// CHECK: ret {{.*}}* @_ZN1XIiE1mE
+// CHECK: [[XM_ADDR:%.+]] = call %struct.Dtor* @llvm.threadlocal.address.p0s_struct.Dtors(%struct.Dtor* @_ZN1XIiE1mE)
+// CHECK: ret {{.*}}* [[XM_ADDR]]
 
 // LINUX_AIX: define internal void @[[VF_M_INIT]]()
 // DARWIN: define internal cxx_fast_tlscc void @[[VF_M_INIT]]()
@@ -185,7 +194,8 @@ int f() {
 // CHECK: br i1 %[[VF_M_INITIALIZED]],
 // need init:
 // CHECK: call{{.*}} i32 @_Z1gv()
-// CHECK: store i32 %{{.*}}, i32* @_ZN1VIfE1mE, align 4
+// CHECK: [[VFM_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN1VIfE1mE)
+// CHECK: store i32 %{{.*}}, i32* [[VFM_ADDR]], align 4
 // CHECK: store i8 1, i8* bitcast (i64* @_ZGVN1VIfE1mE to i8*)
 // CHECK: br label
 
@@ -212,21 +222,24 @@ int f() {
 // LINUX: br i1 icmp ne (void ()* @_ZTHN1VIcE1mE,
 // AIX-NOT: br i1 icmp ne (void ()* @_ZTHN1VIcE1mE
 // LINUX_AIX: call void @_ZTHN1VIcE1mE()
-// LINUX_AIX: ret i32* @_ZN1VIcE1mE
+// LINUX_AIX: [[VEM_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN1VIcE1mE)
+// LINUX_AIX: ret i32* [[VEM_ADDR]]
 
 // DARWIN: declare cxx_fast_tlscc noundef i32* @_ZTWN1WIcE1mE()
 // LINUX_AIX: define linkonce_odr hidden noundef i32* @_ZTWN1WIcE1mE() {{#[0-9]+}}{{( comdat)?}} {
 // LINUX: br i1 icmp ne (void ()* @_ZTHN1WIcE1mE,
 // AIX-NOT: br i1 icmp ne (void ()* @_ZTHN1WIcE1mE,
 // LINUX_AIX: call void @_ZTHN1WIcE1mE()
-// LINUX_AIX: ret i32* @_ZN1WIcE1mE
+// LINUX_AIX: [[WEM_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN1WIcE1mE)
+// LINUX_AIX: ret i32* [[WEM_ADDR]]
 
 // DARWIN: declare cxx_fast_tlscc {{.*}}* @_ZTWN1XIcE1mE()
 // LINUX_AIX: define linkonce_odr hidden {{.*}}* @_ZTWN1XIcE1mE() {{#[0-9]+}}{{( comdat)?}} {
 // LINUX: br i1 icmp ne (void ()* @_ZTHN1XIcE1mE,
 // AIX-NOT: br i1 icmp ne (void ()* @_ZTHN1XIcE1mE,
 // LINUX_AIX: call void @_ZTHN1XIcE1mE()
-// LINUX_AIX: ret {{.*}}* @_ZN1XIcE1mE
+// LINUX_AIX: [[XEM_ADDR:%.+]] = call %struct.Dtor* @llvm.threadlocal.address.p0s_struct.Dtors(%struct.Dtor* @_ZN1XIcE1mE)
+// LINUX_AIX: ret {{.*}}* [[XEM_ADDR]]
 
 struct S { S(); ~S(); };
 struct T { ~T(); };
@@ -317,7 +330,8 @@ void set_anon_i() {
 // CHECK: br i1 %[[V_M_INITIALIZED]],
 // need init:
 // CHECK: call{{.*}} i32 @_Z1gv()
-// CHECK: store i32 %{{.*}}, i32* @_ZN1VIiE1mE, align 4
+// CHECK: [[VEM_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN1VIiE1mE)
+// CHECK: store i32 %{{.*}}, i32* [[VEM_ADDR]], align 4
 // CHECK: store i8 1, i8* bitcast (i64* @_ZGVN1VIiE1mE to i8*)
 // CHECK: br label
 
@@ -359,7 +373,8 @@ void set_anon_i() {
 // DARWIN: define cxx_fast_tlscc noundef i32* @_ZTW1a()
 // LINUX_AIX:   call void @_ZTH1a()
 // DARWIN: call cxx_fast_tlscc void @_ZTH1a()
-// CHECK:   ret i32* @a
+// CHECK:   [[A_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @a)
+// CHECK:   ret i32* [[A_ADDR]]
 // CHECK: }
 
 
@@ -372,7 +387,8 @@ void set_anon_i() {
 // DARWIN-LABEL: define cxx_fast_tlscc noundef i32* @_ZTWN1U1mE()
 // LINUX_AIX: call void @_ZTHN1U1mE()
 // DARWIN: call cxx_fast_tlscc void @_ZTHN1U1mE()
-// CHECK: ret i32* @_ZN1U1mE
+// CHECK: [[UM_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN1U1mE)
+// CHECK: ret i32* [[UM_ADDR]]
 
 // LINUX_AIX: declare extern_weak void @_ZTH1b() [[ATTR:#[0-9]+]]
 

diff  --git a/clang/test/CodeGenCXX/cxx1y-variable-template.cpp b/clang/test/CodeGenCXX/cxx1y-variable-template.cpp
index 856ba49621468..d67d12f6c7bc3 100644
--- a/clang/test/CodeGenCXX/cxx1y-variable-template.cpp
+++ b/clang/test/CodeGenCXX/cxx1y-variable-template.cpp
@@ -41,7 +41,8 @@ namespace PR42111 {
   // CHECK: icmp eq i8 {{.*}}, 0
   // CHECK: br i1
   // CHECK: call noundef i32 @_ZN7PR421111fEv(
-  // CHECK: store i32 {{.*}}, i32* @_ZN7PR4211112_GLOBAL__N_11nILi0EEE
+  // CHECK: [[N_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN7PR4211112_GLOBAL__N_11nILi0EEE)
+  // CHECK: store i32 {{.*}}, i32* [[N_ADDR]]
   // CHECK: store i8 1, i8* @_ZGVN7PR4211112_GLOBAL__N_11nILi0EEE
   int g() { return n<> + n<>; }
 }

diff  --git a/clang/test/CodeGenCXX/cxx2a-thread-local-constinit.cpp b/clang/test/CodeGenCXX/cxx2a-thread-local-constinit.cpp
index 47bdb0df63cdd..6b511db5b98e6 100644
--- a/clang/test/CodeGenCXX/cxx2a-thread-local-constinit.cpp
+++ b/clang/test/CodeGenCXX/cxx2a-thread-local-constinit.cpp
@@ -31,7 +31,8 @@ int get_a() { return a; }
 
 // CHECK-LABEL: define{{.*}} i32 @_Z5get_bv()
 // CHECK-NOT: call
-// CHECK: load i32, i32* @b
+// CHECK: [[B_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @b)
+// CHECK: load i32, i32* [[B_ADDR]]
 // CHECK-NOT: call
 // CHECK: }
 int get_b() { return b; }
@@ -52,7 +53,8 @@ int get_c() { return c; }
 // LINUX-LABEL: define weak_odr {{.*}} @_ZTW1c()
 // CHECK-NOT: br i1
 // CHECK-NOT: call
-// CHECK: ret i32* @c
+// CHECK: [[C_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @c)
+// CHECK: ret i32* [[C_ADDR]]
 // CHECK: }
 
 thread_local int c = 0;

diff  --git a/clang/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp b/clang/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp
index 912c9aab93861..a571936c1a76a 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp
@@ -38,7 +38,8 @@ extern inline S &f() {
 // CHECK-NEXT:  br label %[[init_end:.*]]
 
 // CHECK:     [[init_end]]:
-// CHECK-NEXT:  ret %struct.S* @"?s@?1??f@@YAAAUS@@XZ at 4U2@A"
+// CHECK: [[S_ADDR:%.+]] = call %struct.S* @llvm.threadlocal.address.p0s_struct.Ss(%struct.S* @"?s@?1??f@@YAAAUS@@XZ at 4U2@A")
+// CHECK-NEXT:  ret %struct.S* [[S_ADDR]]
 
 // CHECK:     [[lpad:.*]]:
 // CHECK-NEXT: cleanuppad within none []

diff  --git a/clang/test/CodeGenCXX/pr18635.cpp b/clang/test/CodeGenCXX/pr18635.cpp
index 94b5e4599956d..3cc2ff6bce90f 100644
--- a/clang/test/CodeGenCXX/pr18635.cpp
+++ b/clang/test/CodeGenCXX/pr18635.cpp
@@ -4,7 +4,9 @@
 // CHECK: [[X_GLOBAL:@[^ ]+]]{{.*}}thread_local global
 
 // returned somewhere in TLS wrapper:
-// CHECK: ret{{.*}}[[X_GLOBAL]]
+// CHECK: define {{.+}} ptr @_ZTW1x(
+// CHECK: [[X_GLOBAL_ADDR:%[^ ]+]] = call ptr @llvm.threadlocal.address.p0(ptr [[X_GLOBAL]])
+// CHECK: ret{{.*}}[[X_GLOBAL_ADDR]]
 
 template <typename T> class unique_ptr {
   template <typename F, typename S> struct pair {
@@ -19,4 +21,3 @@ template <typename T> class unique_ptr {
 
 thread_local unique_ptr<int> x;
 int main() { x = unique_ptr<int>(new int(5)); }
-

diff  --git a/clang/test/CodeGenCXX/threadlocal_address.cpp b/clang/test/CodeGenCXX/threadlocal_address.cpp
new file mode 100644
index 0000000000000..d61b578034a86
--- /dev/null
+++ b/clang/test/CodeGenCXX/threadlocal_address.cpp
@@ -0,0 +1,75 @@
+// Test that the use of thread local variables would be wrapped by @llvm.threadlocal.address intrinsics.
+// RUN: %clang_cc1 -std=c++11 -emit-llvm -triple %itanium_abi_triple -o - %s -disable-llvm-passes | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm -triple %itanium_abi_triple -o - -O1 %s | FileCheck %s -check-prefix=CHECK-O1
+// RUN: %clang_cc1 -std=c++11 -no-opaque-pointers -emit-llvm -triple %itanium_abi_triple -o - %s -disable-llvm-passes | FileCheck %s -check-prefix=CHECK-NOOPAQUE
+thread_local int i;
+int g() {
+  i++;
+  return i;
+}
+// CHECK: @i = thread_local global i32 0
+// CHECK: @_ZZ1fvE1j = internal thread_local global i32 0
+//
+// CHECK: @_Z1gv()
+// CHECK-NEXT: entry
+// CHECK-NEXT: %[[IA:.+]] = call ptr @llvm.threadlocal.address.p0(ptr @i)
+// CHECK-NEXT: %[[VA:.+]] = load i32, ptr %[[IA]]
+// CHECK-NEXT: %[[INC:.+]] = add nsw i32 %[[VA]], 1
+// CHECK-NEXT: store i32 %[[INC]], ptr %[[IA]], align 4
+// CHECK-NEXT: %[[IA2:.+]] = call ptr @llvm.threadlocal.address.p0(ptr @i)
+// CHECK-NEXT: %[[RET:.+]] = load i32, ptr %[[IA2]], align 4
+// CHECK-NEXT: ret i32 %[[RET]]
+//
+// CHECK: declare ptr @llvm.threadlocal.address.p0(ptr) #[[ATTR_NUM:.+]]
+//
+// CHECK-O1-LABEL: @_Z1gv
+// CHECK-O1-NEXT: entry:
+// CHECK-O1-NEXT:   %[[I_ADDR:.+]] = {{.*}}call ptr @llvm.threadlocal.address.p0(ptr nonnull @i)
+// CHECK-O1-NEXT:   %[[VAL:.+]] = load i32, ptr %[[I_ADDR]]
+// CHECK-O1-NEXT:   %[[INC:.+]] = add nsw i32 %[[VAL]], 1
+// CHECK-O1-NEXT:   store i32 %[[INC]], ptr %[[I_ADDR]]
+// CHECK-O1-NEXT:   ret i32 %[[INC]]
+//
+// CHECK-NOOPAQUE-LABEL: @_Z1gv
+// CHECK-NOOPAQUE-NEXT: entry:
+// CHECK-NOOPAQUE-NEXT:   %[[I_ADDR:.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @i)
+// CHECK-NOOPAQUE-NEXT:   %[[VAL:.+]] = load i32, i32* %[[I_ADDR]]
+// CHECK-NOOPAQUE-NEXT:   %[[INC:.+]] = add nsw i32 %[[VAL]], 1
+// CHECK-NOOPAQUE-NEXT:   store i32 %[[INC]], i32* %[[I_ADDR]]
+// CHECK-NOOPAQUE-NEXT:   %[[IA2:.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @i)
+// CHECK-NOOPAQUE-NEXT:   %[[RET:.+]] = load i32, i32* %[[IA2]], align 4
+// CHECK-NOOPAQUE-NEXT:   ret i32 %[[RET]]
+int f() {
+  thread_local int j = 0;
+  j++;
+  return j;
+}
+// CHECK: @_Z1fv()
+// CHECK-NEXT: entry
+// CHECK-NEXT: %[[JA:.+]] = call ptr @llvm.threadlocal.address.p0(ptr @_ZZ1fvE1j)
+// CHECK-NEXT: %[[VA:.+]] = load i32, ptr %[[JA]]
+// CHECK-NEXT: %[[INC:.+]] = add nsw i32 %[[VA]], 1
+// CHECK-NEXT: store i32 %[[INC]], ptr %[[JA]], align 4
+// CHECK-NEXT: %[[JA2:.+]] = call ptr @llvm.threadlocal.address.p0(ptr @_ZZ1fvE1j)
+// CHECK-NEXT: %[[RET:.+]] = load i32, ptr %[[JA2]], align 4
+// CHECK-NEXT: ret i32 %[[RET]]
+//
+// CHECK-O1-LABEL: @_Z1fv
+// CHECK-O1-NEXT: entry:
+// CHECK-O1-NEXT:   %[[J_ADDR:.+]] = {{.*}}call ptr @llvm.threadlocal.address.p0(ptr nonnull @_ZZ1fvE1j)
+// CHECK-O1-NEXT:   %[[VAL:.+]] = load i32, ptr %[[J_ADDR]]
+// CHECK-O1-NEXT:   %[[INC:.+]] = add nsw i32 %[[VAL]], 1
+// CHECK-O1-NEXT:   store i32 %[[INC]], ptr %[[J_ADDR]]
+// CHECK-O1-NEXT:   ret i32 %[[INC]]
+//
+// CHECK-NOOPAQUE: @_Z1fv()
+// CHECK-NOOPAQUE-NEXT: entry
+// CHECK-NOOPAQUE-NEXT: %[[JA:.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ1fvE1j)
+// CHECK-NOOPAQUE-NEXT: %[[VA:.+]] = load i32, i32* %[[JA]]
+// CHECK-NOOPAQUE-NEXT: %[[INC:.+]] = add nsw i32 %[[VA]], 1
+// CHECK-NOOPAQUE-NEXT: store i32 %[[INC]], i32* %[[JA]], align 4
+// CHECK-NOOPAQUE-NEXT: %[[JA2:.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ1fvE1j)
+// CHECK-NOOPAQUE-NEXT: %[[RET:.+]] = load i32, i32* %[[JA2]], align 4
+// CHECK-NOOPAQUE-NEXT: ret i32 %[[RET]]
+//
+// CHECK: attributes #[[ATTR_NUM]] = { nocallback nofree nosync nounwind readnone speculatable willreturn }

diff  --git a/clang/test/Modules/initializers.cpp b/clang/test/Modules/initializers.cpp
index bb43aa0bc78e0..06d7108136185 100644
--- a/clang/test/Modules/initializers.cpp
+++ b/clang/test/Modules/initializers.cpp
@@ -174,11 +174,13 @@ inline void use(bool b, ...) {
 
 // CHECK: define {{.*}} @[[G_INIT:__cxx_global.*]]()
 // CHECK: load {{.*}} (i64* @_ZGV
-// CHECK: store {{.*}}, i32* @[[G]],
+// CHECK: [[G_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @[[G]])
+// CHECK: store {{.*}}, i32* [[G_ADDR]]
 
 // CHECK: define {{.*}} @[[H_INIT:__cxx_global.*]]()
 // CHECK: load {{.*}} (i64* @_ZGV
-// CHECK: store {{.*}}, i32* @[[H]],
+// CHECK: [[H_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @[[H]])
+// CHECK: store {{.*}}, i32* [[H_ADDR]],
 
 // FIXME: Should this use __cxa_guard_acquire?
 // CHECK: define {{.*}} @[[XA_INIT]]()
@@ -187,7 +189,8 @@ inline void use(bool b, ...) {
 
 // CHECK: define {{.*}} @[[XC_INIT:__cxx_global.*]]()
 // CHECK: load {{.*}} (i64* @_ZGV
-// CHECK: store {{.*}}, i32* @[[XC]],
+// CHECK: [[XC_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @[[XC]])
+// CHECK: store {{.*}}, i32* [[XC_ADDR]],
 
 // FIXME: Should this use __cxa_guard_acquire?
 // CHECK: define {{.*}} @[[XE_INIT]]()
@@ -196,11 +199,13 @@ inline void use(bool b, ...) {
 
 // CHECK: define {{.*}} @[[XG_INIT:__cxx_global.*]]()
 // CHECK: load {{.*}} (i64* @_ZGV
-// CHECK: store {{.*}}, i32* @[[XG]],
+// CHECK: [[XG_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @[[XG]])
+// CHECK: store {{.*}}, i32* [[XG_ADDR]],
 
 // CHECK: define {{.*}} @[[XH_INIT:__cxx_global.*]]()
 // CHECK: load {{.*}} (i64* @_ZGV
-// CHECK: store {{.*}}, i32* @[[XH]],
+// CHECK: [[XH_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @[[XH]])
+// CHECK: store {{.*}}, i32* [[XH_ADDR]],
 
 // FIXME: Should this use __cxa_guard_acquire?
 // CHECK: define {{.*}} @[[XF_INIT]]()
@@ -209,7 +214,8 @@ inline void use(bool b, ...) {
 
 // CHECK: define {{.*}} @[[XD_INIT:__cxx_global.*]]()
 // CHECK: load {{.*}} (i64* @_ZGV
-// CHECK: store {{.*}}, i32* @[[XD]],
+// CHECK: [[XD_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @[[XD]])
+// CHECK: store {{.*}}, i32* [[XD_ADDR]],
 
 // FIXME: Should this use __cxa_guard_acquire?
 // CHECK: define {{.*}} @[[XB_INIT]]()
@@ -226,11 +232,13 @@ inline void use(bool b, ...) {
 
 // CHECK-IMPORT: define {{.*}} @[[C_INIT:__cxx_global.*]]()
 // CHECK-IMPORT: call noundef i32 @_Z11non_trivialv(
-// CHECK-IMPORT: store {{.*}}, i32* @[[C]],
+// CHECK-IMPORT: [[C_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @[[C]])
+// CHECK-IMPORT: store {{.*}}, i32* [[C_ADDR]],
 
 // CHECK-IMPORT: define {{.*}} @[[D_INIT:__cxx_global.*]]()
 // CHECK-IMPORT: load {{.*}} (i64* @_ZGV
-// CHECK-IMPORT: store {{.*}}, i32* @[[D]],
+// CHECK-IMPORT: [[D_ADDR:%.+]] = call i32* @llvm.threadlocal.address.p0i32(i32* @[[D]])
+// CHECK-IMPORT: store {{.*}}, i32* [[D_ADDR]],
 
 
 // CHECK-IMPORT: define {{.*}} @[[TU_INIT]]()

diff  --git a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp
index adefb908f249d..56fa85f08b352 100644
--- a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp
@@ -49,7 +49,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i32 [[TMP0]], i32* [[CONV]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[ARGV_ADDR]], align 8
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l14(i64 [[TMP1]], i8** [[TMP2]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l14(i64 [[TMP1]], i8** [[TMP2]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    ret i32 0
 //
 //
@@ -463,15 +463,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -484,32 +485,33 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
-// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8***, align 8
@@ -562,7 +564,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i8***)*
-// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR5]]
+// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR6]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8**, i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8

diff  --git a/clang/test/OpenMP/for_reduction_task_codegen.cpp b/clang/test/OpenMP/for_reduction_task_codegen.cpp
index cf1c1bcf8a529..64eec60992ca2 100644
--- a/clang/test/OpenMP/for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/for_reduction_task_codegen.cpp
@@ -372,15 +372,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -393,32 +394,33 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
-// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8***, align 8
@@ -471,7 +473,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i8***)*
-// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8**, i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8

diff  --git a/clang/test/OpenMP/parallel_copyin_codegen.cpp b/clang/test/OpenMP/parallel_copyin_codegen.cpp
index 40f338b36bc30..4981f1778bd48 100644
--- a/clang/test/OpenMP/parallel_copyin_codegen.cpp
+++ b/clang/test/OpenMP/parallel_copyin_codegen.cpp
@@ -1106,13 +1106,18 @@ void foo() {
 // CHECK11-NEXT:    store i8 1, i8* @_ZGVZ4mainE3var, align 1
 // CHECK11-NEXT:    br label [[INIT_END3]]
 // CHECK11:       init.end3:
-// CHECK11-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, [2 x i32]*, [2 x %struct.S]*, %struct.S*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* @_ZZ4mainE5t_var, [2 x i32]* @_ZZ4mainE3vec, [2 x %struct.S]* @_ZZ4mainE5s_arr, %struct.S* @_ZZ4mainE3var)
-// CHECK11-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* @_ZZ4mainE5t_var)
+// CHECK11-NEXT:    [[TMP4:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ4mainE5t_var)
+// CHECK11-NEXT:    [[TMP5:%.*]] = call [2 x i32]* @llvm.threadlocal.address.p0a2i32([2 x i32]* @_ZZ4mainE3vec)
+// CHECK11-NEXT:    [[TMP6:%.*]] = call [2 x %struct.S]* @llvm.threadlocal.address.p0a2s_struct.Ss([2 x %struct.S]* @_ZZ4mainE5s_arr)
+// CHECK11-NEXT:    [[TMP7:%.*]] = call %struct.S* @llvm.threadlocal.address.p0s_struct.Ss(%struct.S* @_ZZ4mainE3var)
+// CHECK11-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, [2 x i32]*, [2 x %struct.S]*, %struct.S*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* [[TMP4]], [2 x i32]* [[TMP5]], [2 x %struct.S]* [[TMP6]], %struct.S* [[TMP7]])
+// CHECK11-NEXT:    [[TMP8:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ4mainE5t_var)
+// CHECK11-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* [[TMP8]])
 // CHECK11-NEXT:    [[CALL4:%.*]] = call noundef i32 @_Z5tmainIiET_v()
 // CHECK11-NEXT:    store i32 [[CALL4]], i32* [[RETVAL]], align 4
 // CHECK11-NEXT:    call void @_ZN1SIfED1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]]
-// CHECK11-NEXT:    [[TMP4:%.*]] = load i32, i32* [[RETVAL]], align 4
-// CHECK11-NEXT:    ret i32 [[TMP4]]
+// CHECK11-NEXT:    [[TMP9:%.*]] = load i32, i32* [[RETVAL]], align 4
+// CHECK11-NEXT:    ret i32 [[TMP9]]
 //
 //
 // CHECK11-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ev
@@ -1194,34 +1199,49 @@ void foo() {
 // CHECK11-NEXT:    [[TMP1:%.*]] = load [2 x i32]*, [2 x i32]** [[VEC_ADDR]], align 8
 // CHECK11-NEXT:    [[TMP2:%.*]] = load [2 x %struct.S]*, [2 x %struct.S]** [[S_ARR_ADDR]], align 8
 // CHECK11-NEXT:    [[TMP3:%.*]] = load %struct.S*, %struct.S** [[VAR_ADDR]], align 8
-// CHECK11-NEXT:    [[TMP4:%.*]] = ptrtoint i32* [[TMP0]] to i64
-// CHECK11-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], ptrtoint (i32* @_ZZ4mainE5t_var to i64)
-// CHECK11-NEXT:    br i1 [[TMP5]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
+// CHECK11-NEXT:    [[TMP4:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ4mainE5t_var)
+// CHECK11-NEXT:    [[TMP5:%.*]] = ptrtoint i32* [[TMP0]] to i64
+// CHECK11-NEXT:    [[TMP6:%.*]] = ptrtoint i32* [[TMP4]] to i64
+// CHECK11-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP5]], [[TMP6]]
+// CHECK11-NEXT:    br i1 [[TMP7]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
 // CHECK11:       copyin.not.master:
-// CHECK11-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK11-NEXT:    store i32 [[TMP6]], i32* @_ZZ4mainE5t_var, align 4
-// CHECK11-NEXT:    [[TMP7:%.*]] = bitcast [2 x i32]* [[TMP1]] to i8*
-// CHECK11-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 bitcast ([2 x i32]* @_ZZ4mainE3vec to i8*), i8* align 4 [[TMP7]], i64 8, i1 false)
-// CHECK11-NEXT:    [[TMP8:%.*]] = bitcast [2 x %struct.S]* [[TMP2]] to %struct.S*
-// CHECK11-NEXT:    br i1 icmp eq (%struct.S* getelementptr inbounds ([2 x %struct.S], [2 x %struct.S]* @_ZZ4mainE5s_arr, i32 0, i32 0), %struct.S* getelementptr inbounds ([2 x %struct.S], [2 x %struct.S]* @_ZZ4mainE5s_arr, i64 1, i64 0)), label [[OMP_ARRAYCPY_DONE1:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// CHECK11-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 [[TMP8]], i32* [[TMP4]], align 4
+// CHECK11-NEXT:    [[TMP9:%.*]] = call [2 x i32]* @llvm.threadlocal.address.p0a2i32([2 x i32]* @_ZZ4mainE3vec)
+// CHECK11-NEXT:    [[TMP10:%.*]] = bitcast [2 x i32]* [[TMP9]] to i8*
+// CHECK11-NEXT:    [[TMP11:%.*]] = bitcast [2 x i32]* [[TMP1]] to i8*
+// CHECK11-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i64 8, i1 false)
+// CHECK11-NEXT:    [[TMP12:%.*]] = call [2 x %struct.S]* @llvm.threadlocal.address.p0a2s_struct.Ss([2 x %struct.S]* @_ZZ4mainE5s_arr)
+// CHECK11-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP12]], i32 0, i32 0
+// CHECK11-NEXT:    [[TMP13:%.*]] = bitcast [2 x %struct.S]* [[TMP2]] to %struct.S*
+// CHECK11-NEXT:    [[TMP14:%.*]] = getelementptr [[STRUCT_S:%.*]], %struct.S* [[ARRAY_BEGIN]], i64 2
+// CHECK11-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq %struct.S* [[ARRAY_BEGIN]], [[TMP14]]
+// CHECK11-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE1:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK11:       omp.arraycpy.body:
-// CHECK11-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP8]], [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK11-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S* [ getelementptr inbounds ([2 x %struct.S], [2 x %struct.S]* @_ZZ4mainE5s_arr, i32 0, i32 0), [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK11-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP13]], [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK11-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S* [ [[ARRAY_BEGIN]], [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
 // CHECK11-NEXT:    [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) %struct.S* @_ZN1SIfEaSERKS0_(%struct.S* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], %struct.S* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]])
-// CHECK11-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S:%.*]], %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// CHECK11-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK11-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK11-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S* [[OMP_ARRAYCPY_DEST_ELEMENT]], getelementptr inbounds ([2 x %struct.S], [2 x %struct.S]* @_ZZ4mainE5s_arr, i64 1, i64 0)
+// CHECK11-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP14]]
 // CHECK11-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE1]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK11:       omp.arraycpy.done1:
-// CHECK11-NEXT:    [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) %struct.S* @_ZN1SIfEaSERKS0_(%struct.S* noundef nonnull align 4 dereferenceable(4) @_ZZ4mainE3var, %struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP3]])
+// CHECK11-NEXT:    [[TMP15:%.*]] = call %struct.S* @llvm.threadlocal.address.p0s_struct.Ss(%struct.S* @_ZZ4mainE3var)
+// CHECK11-NEXT:    [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) %struct.S* @_ZN1SIfEaSERKS0_(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP15]], %struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP3]])
 // CHECK11-NEXT:    br label [[COPYIN_NOT_MASTER_END]]
 // CHECK11:       copyin.not.master.end:
-// CHECK11-NEXT:    [[TMP9:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK11-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
-// CHECK11-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP10]])
-// CHECK11-NEXT:    [[TMP11:%.*]] = load i32, i32* @_ZZ4mainE5t_var, align 4
-// CHECK11-NEXT:    store i32 [[TMP11]], i32* getelementptr inbounds ([2 x i32], [2 x i32]* @_ZZ4mainE3vec, i64 0, i64 0), align 4
-// CHECK11-NEXT:    [[CALL3:%.*]] = call noundef nonnull align 4 dereferenceable(4) %struct.S* @_ZN1SIfEaSERKS0_(%struct.S* noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([2 x %struct.S], [2 x %struct.S]* @_ZZ4mainE5s_arr, i64 0, i64 0), %struct.S* noundef nonnull align 4 dereferenceable(4) @_ZZ4mainE3var)
+// CHECK11-NEXT:    [[TMP16:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK11-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
+// CHECK11-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP17]])
+// CHECK11-NEXT:    [[TMP18:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ4mainE5t_var)
+// CHECK11-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4
+// CHECK11-NEXT:    [[TMP20:%.*]] = call [2 x i32]* @llvm.threadlocal.address.p0a2i32([2 x i32]* @_ZZ4mainE3vec)
+// CHECK11-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[TMP20]], i64 0, i64 0
+// CHECK11-NEXT:    store i32 [[TMP19]], i32* [[ARRAYIDX]], align 4
+// CHECK11-NEXT:    [[TMP21:%.*]] = call %struct.S* @llvm.threadlocal.address.p0s_struct.Ss(%struct.S* @_ZZ4mainE3var)
+// CHECK11-NEXT:    [[TMP22:%.*]] = call [2 x %struct.S]* @llvm.threadlocal.address.p0a2s_struct.Ss([2 x %struct.S]* @_ZZ4mainE5s_arr)
+// CHECK11-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP22]], i64 0, i64 0
+// CHECK11-NEXT:    [[CALL4:%.*]] = call noundef nonnull align 4 dereferenceable(4) %struct.S* @_ZN1SIfEaSERKS0_(%struct.S* noundef nonnull align 4 dereferenceable(4) [[ARRAYIDX3]], %struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP21]])
 // CHECK11-NEXT:    ret void
 //
 //
@@ -1235,20 +1255,23 @@ void foo() {
 // CHECK11-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 // CHECK11-NEXT:    store i32* [[T_VAR]], i32** [[T_VAR_ADDR]], align 8
 // CHECK11-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[T_VAR_ADDR]], align 8
-// CHECK11-NEXT:    [[TMP1:%.*]] = ptrtoint i32* [[TMP0]] to i64
-// CHECK11-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], ptrtoint (i32* @_ZZ4mainE5t_var to i64)
-// CHECK11-NEXT:    br i1 [[TMP2]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
+// CHECK11-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ4mainE5t_var)
+// CHECK11-NEXT:    [[TMP2:%.*]] = ptrtoint i32* [[TMP0]] to i64
+// CHECK11-NEXT:    [[TMP3:%.*]] = ptrtoint i32* [[TMP1]] to i64
+// CHECK11-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
+// CHECK11-NEXT:    br i1 [[TMP4]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
 // CHECK11:       copyin.not.master:
-// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK11-NEXT:    store i32 [[TMP3]], i32* @_ZZ4mainE5t_var, align 4
+// CHECK11-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 [[TMP5]], i32* [[TMP1]], align 4
 // CHECK11-NEXT:    br label [[COPYIN_NOT_MASTER_END]]
 // CHECK11:       copyin.not.master.end:
-// CHECK11-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK11-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
-// CHECK11-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]])
-// CHECK11-NEXT:    [[TMP6:%.*]] = load i32, i32* @_ZZ4mainE5t_var, align 4
-// CHECK11-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP6]], 1
-// CHECK11-NEXT:    store i32 [[INC]], i32* @_ZZ4mainE5t_var, align 4
+// CHECK11-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK11-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK11-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]])
+// CHECK11-NEXT:    [[TMP8:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ4mainE5t_var)
+// CHECK11-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK11-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK11-NEXT:    store i32 [[INC]], i32* [[TMP8]], align 4
 // CHECK11-NEXT:    ret void
 //
 //
@@ -1280,8 +1303,13 @@ void foo() {
 // CHECK11-NEXT:    store i8 1, i8* bitcast (i64* @_ZGVZ5tmainIiET_vE3var to i8*), align 8
 // CHECK11-NEXT:    br label [[INIT_END3]]
 // CHECK11:       init.end3:
-// CHECK11-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, [2 x i32]*, [2 x %struct.S.0]*, %struct.S.0*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* @_ZZ5tmainIiET_vE5t_var, [2 x i32]* @_ZZ5tmainIiET_vE3vec, [2 x %struct.S.0]* @_ZZ5tmainIiET_vE5s_arr, %struct.S.0* @_ZZ5tmainIiET_vE3var)
-// CHECK11-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* @_ZZ5tmainIiET_vE5t_var)
+// CHECK11-NEXT:    [[TMP4:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ5tmainIiET_vE5t_var)
+// CHECK11-NEXT:    [[TMP5:%.*]] = call [2 x i32]* @llvm.threadlocal.address.p0a2i32([2 x i32]* @_ZZ5tmainIiET_vE3vec)
+// CHECK11-NEXT:    [[TMP6:%.*]] = call [2 x %struct.S.0]* @llvm.threadlocal.address.p0a2s_struct.S.0s([2 x %struct.S.0]* @_ZZ5tmainIiET_vE5s_arr)
+// CHECK11-NEXT:    [[TMP7:%.*]] = call %struct.S.0* @llvm.threadlocal.address.p0s_struct.S.0s(%struct.S.0* @_ZZ5tmainIiET_vE3var)
+// CHECK11-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, [2 x i32]*, [2 x %struct.S.0]*, %struct.S.0*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* [[TMP4]], [2 x i32]* [[TMP5]], [2 x %struct.S.0]* [[TMP6]], %struct.S.0* [[TMP7]])
+// CHECK11-NEXT:    [[TMP8:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ5tmainIiET_vE5t_var)
+// CHECK11-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* [[TMP8]])
 // CHECK11-NEXT:    call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]]
 // CHECK11-NEXT:    ret i32 0
 //
@@ -1293,8 +1321,9 @@ void foo() {
 // CHECK11-NEXT:    store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8
 // CHECK11-NEXT:    [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8
 // CHECK11-NEXT:    [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0
-// CHECK11-NEXT:    [[TMP0:%.*]] = load volatile i32, i32* @g, align 128
-// CHECK11-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+// CHECK11-NEXT:    [[TMP0:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK11-NEXT:    [[TMP1:%.*]] = load volatile i32, i32* [[TMP0]], align 128
+// CHECK11-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to float
 // CHECK11-NEXT:    store float [[CONV]], float* [[F]], align 4
 // CHECK11-NEXT:    ret void
 //
@@ -1318,8 +1347,9 @@ void foo() {
 // CHECK11-NEXT:    [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8
 // CHECK11-NEXT:    [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP0:%.*]] = load float, float* [[A_ADDR]], align 4
-// CHECK11-NEXT:    [[TMP1:%.*]] = load volatile i32, i32* @g, align 128
-// CHECK11-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to float
+// CHECK11-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK11-NEXT:    [[TMP2:%.*]] = load volatile i32, i32* [[TMP1]], align 128
+// CHECK11-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP2]] to float
 // CHECK11-NEXT:    [[ADD:%.*]] = fadd float [[TMP0]], [[CONV]]
 // CHECK11-NEXT:    store float [[ADD]], float* [[F]], align 4
 // CHECK11-NEXT:    ret void
@@ -1404,34 +1434,49 @@ void foo() {
 // CHECK11-NEXT:    [[TMP1:%.*]] = load [2 x i32]*, [2 x i32]** [[VEC_ADDR]], align 8
 // CHECK11-NEXT:    [[TMP2:%.*]] = load [2 x %struct.S.0]*, [2 x %struct.S.0]** [[S_ARR_ADDR]], align 8
 // CHECK11-NEXT:    [[TMP3:%.*]] = load %struct.S.0*, %struct.S.0** [[VAR_ADDR]], align 8
-// CHECK11-NEXT:    [[TMP4:%.*]] = ptrtoint i32* [[TMP0]] to i64
-// CHECK11-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], ptrtoint (i32* @_ZZ5tmainIiET_vE5t_var to i64)
-// CHECK11-NEXT:    br i1 [[TMP5]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
+// CHECK11-NEXT:    [[TMP4:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ5tmainIiET_vE5t_var)
+// CHECK11-NEXT:    [[TMP5:%.*]] = ptrtoint i32* [[TMP0]] to i64
+// CHECK11-NEXT:    [[TMP6:%.*]] = ptrtoint i32* [[TMP4]] to i64
+// CHECK11-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP5]], [[TMP6]]
+// CHECK11-NEXT:    br i1 [[TMP7]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
 // CHECK11:       copyin.not.master:
-// CHECK11-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP0]], align 128
-// CHECK11-NEXT:    store i32 [[TMP6]], i32* @_ZZ5tmainIiET_vE5t_var, align 128
-// CHECK11-NEXT:    [[TMP7:%.*]] = bitcast [2 x i32]* [[TMP1]] to i8*
-// CHECK11-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 bitcast ([2 x i32]* @_ZZ5tmainIiET_vE3vec to i8*), i8* align 128 [[TMP7]], i64 8, i1 false)
-// CHECK11-NEXT:    [[TMP8:%.*]] = bitcast [2 x %struct.S.0]* [[TMP2]] to %struct.S.0*
-// CHECK11-NEXT:    br i1 icmp eq (%struct.S.0* getelementptr inbounds ([2 x %struct.S.0], [2 x %struct.S.0]* @_ZZ5tmainIiET_vE5s_arr, i32 0, i32 0), %struct.S.0* getelementptr inbounds ([2 x %struct.S.0], [2 x %struct.S.0]* @_ZZ5tmainIiET_vE5s_arr, i64 1, i64 0)), label [[OMP_ARRAYCPY_DONE1:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// CHECK11-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 128
+// CHECK11-NEXT:    store i32 [[TMP8]], i32* [[TMP4]], align 128
+// CHECK11-NEXT:    [[TMP9:%.*]] = call [2 x i32]* @llvm.threadlocal.address.p0a2i32([2 x i32]* @_ZZ5tmainIiET_vE3vec)
+// CHECK11-NEXT:    [[TMP10:%.*]] = bitcast [2 x i32]* [[TMP9]] to i8*
+// CHECK11-NEXT:    [[TMP11:%.*]] = bitcast [2 x i32]* [[TMP1]] to i8*
+// CHECK11-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 [[TMP10]], i8* align 128 [[TMP11]], i64 8, i1 false)
+// CHECK11-NEXT:    [[TMP12:%.*]] = call [2 x %struct.S.0]* @llvm.threadlocal.address.p0a2s_struct.S.0s([2 x %struct.S.0]* @_ZZ5tmainIiET_vE5s_arr)
+// CHECK11-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP12]], i32 0, i32 0
+// CHECK11-NEXT:    [[TMP13:%.*]] = bitcast [2 x %struct.S.0]* [[TMP2]] to %struct.S.0*
+// CHECK11-NEXT:    [[TMP14:%.*]] = getelementptr [[STRUCT_S_0:%.*]], %struct.S.0* [[ARRAY_BEGIN]], i64 2
+// CHECK11-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq %struct.S.0* [[ARRAY_BEGIN]], [[TMP14]]
+// CHECK11-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE1:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK11:       omp.arraycpy.body:
-// CHECK11-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP8]], [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK11-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S.0* [ getelementptr inbounds ([2 x %struct.S.0], [2 x %struct.S.0]* @_ZZ5tmainIiET_vE5s_arr, i32 0, i32 0), [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK11-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP13]], [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK11-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S.0* [ [[ARRAY_BEGIN]], [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
 // CHECK11-NEXT:    [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) %struct.S.0* @_ZN1SIiEaSERKS0_(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]])
-// CHECK11-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0:%.*]], %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// CHECK11-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK11-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK11-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S.0* [[OMP_ARRAYCPY_DEST_ELEMENT]], getelementptr inbounds ([2 x %struct.S.0], [2 x %struct.S.0]* @_ZZ5tmainIiET_vE5s_arr, i64 1, i64 0)
+// CHECK11-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S.0* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP14]]
 // CHECK11-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE1]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK11:       omp.arraycpy.done1:
-// CHECK11-NEXT:    [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) %struct.S.0* @_ZN1SIiEaSERKS0_(%struct.S.0* noundef nonnull align 4 dereferenceable(4) @_ZZ5tmainIiET_vE3var, %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP3]])
+// CHECK11-NEXT:    [[TMP15:%.*]] = call %struct.S.0* @llvm.threadlocal.address.p0s_struct.S.0s(%struct.S.0* @_ZZ5tmainIiET_vE3var)
+// CHECK11-NEXT:    [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) %struct.S.0* @_ZN1SIiEaSERKS0_(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP15]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP3]])
 // CHECK11-NEXT:    br label [[COPYIN_NOT_MASTER_END]]
 // CHECK11:       copyin.not.master.end:
-// CHECK11-NEXT:    [[TMP9:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK11-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
-// CHECK11-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1]], i32 [[TMP10]])
-// CHECK11-NEXT:    [[TMP11:%.*]] = load i32, i32* @_ZZ5tmainIiET_vE5t_var, align 128
-// CHECK11-NEXT:    store i32 [[TMP11]], i32* getelementptr inbounds ([2 x i32], [2 x i32]* @_ZZ5tmainIiET_vE3vec, i64 0, i64 0), align 128
-// CHECK11-NEXT:    [[CALL3:%.*]] = call noundef nonnull align 4 dereferenceable(4) %struct.S.0* @_ZN1SIiEaSERKS0_(%struct.S.0* noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([2 x %struct.S.0], [2 x %struct.S.0]* @_ZZ5tmainIiET_vE5s_arr, i64 0, i64 0), %struct.S.0* noundef nonnull align 4 dereferenceable(4) @_ZZ5tmainIiET_vE3var)
+// CHECK11-NEXT:    [[TMP16:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK11-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
+// CHECK11-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]])
+// CHECK11-NEXT:    [[TMP18:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ5tmainIiET_vE5t_var)
+// CHECK11-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 128
+// CHECK11-NEXT:    [[TMP20:%.*]] = call [2 x i32]* @llvm.threadlocal.address.p0a2i32([2 x i32]* @_ZZ5tmainIiET_vE3vec)
+// CHECK11-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[TMP20]], i64 0, i64 0
+// CHECK11-NEXT:    store i32 [[TMP19]], i32* [[ARRAYIDX]], align 128
+// CHECK11-NEXT:    [[TMP21:%.*]] = call %struct.S.0* @llvm.threadlocal.address.p0s_struct.S.0s(%struct.S.0* @_ZZ5tmainIiET_vE3var)
+// CHECK11-NEXT:    [[TMP22:%.*]] = call [2 x %struct.S.0]* @llvm.threadlocal.address.p0a2s_struct.S.0s([2 x %struct.S.0]* @_ZZ5tmainIiET_vE5s_arr)
+// CHECK11-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP22]], i64 0, i64 0
+// CHECK11-NEXT:    [[CALL4:%.*]] = call noundef nonnull align 4 dereferenceable(4) %struct.S.0* @_ZN1SIiEaSERKS0_(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYIDX3]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP21]])
 // CHECK11-NEXT:    ret void
 //
 //
@@ -1445,17 +1490,19 @@ void foo() {
 // CHECK11-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 // CHECK11-NEXT:    store i32* [[T_VAR]], i32** [[T_VAR_ADDR]], align 8
 // CHECK11-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[T_VAR_ADDR]], align 8
-// CHECK11-NEXT:    [[TMP1:%.*]] = ptrtoint i32* [[TMP0]] to i64
-// CHECK11-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], ptrtoint (i32* @_ZZ5tmainIiET_vE5t_var to i64)
-// CHECK11-NEXT:    br i1 [[TMP2]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
+// CHECK11-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZZ5tmainIiET_vE5t_var)
+// CHECK11-NEXT:    [[TMP2:%.*]] = ptrtoint i32* [[TMP0]] to i64
+// CHECK11-NEXT:    [[TMP3:%.*]] = ptrtoint i32* [[TMP1]] to i64
+// CHECK11-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
+// CHECK11-NEXT:    br i1 [[TMP4]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
 // CHECK11:       copyin.not.master:
-// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 128
-// CHECK11-NEXT:    store i32 [[TMP3]], i32* @_ZZ5tmainIiET_vE5t_var, align 128
+// CHECK11-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 128
+// CHECK11-NEXT:    store i32 [[TMP5]], i32* [[TMP1]], align 128
 // CHECK11-NEXT:    br label [[COPYIN_NOT_MASTER_END]]
 // CHECK11:       copyin.not.master.end:
-// CHECK11-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK11-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
-// CHECK11-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]])
+// CHECK11-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK11-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK11-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]])
 // CHECK11-NEXT:    ret void
 //
 //
@@ -1466,8 +1513,9 @@ void foo() {
 // CHECK11-NEXT:    store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8
 // CHECK11-NEXT:    [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8
 // CHECK11-NEXT:    [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0
-// CHECK11-NEXT:    [[TMP0:%.*]] = load volatile i32, i32* @g, align 128
-// CHECK11-NEXT:    store i32 [[TMP0]], i32* [[F]], align 4
+// CHECK11-NEXT:    [[TMP0:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK11-NEXT:    [[TMP1:%.*]] = load volatile i32, i32* [[TMP0]], align 128
+// CHECK11-NEXT:    store i32 [[TMP1]], i32* [[F]], align 4
 // CHECK11-NEXT:    ret void
 //
 //
@@ -1490,15 +1538,17 @@ void foo() {
 // CHECK11-NEXT:    [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8
 // CHECK11-NEXT:    [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
-// CHECK11-NEXT:    [[TMP1:%.*]] = load volatile i32, i32* @g, align 128
-// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
+// CHECK11-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK11-NEXT:    [[TMP2:%.*]] = load volatile i32, i32* [[TMP1]], align 128
+// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP2]]
 // CHECK11-NEXT:    store i32 [[ADD]], i32* [[F]], align 4
 // CHECK11-NEXT:    ret void
 //
 //
 // CHECK11-LABEL: define {{[^@]+}}@_ZTW1g
-// CHECK11-SAME: () #[[ATTR8:[0-9]+]] comdat {
-// CHECK11-NEXT:    ret i32* @g
+// CHECK11-SAME: () #[[ATTR9:[0-9]+]] comdat {
+// CHECK11-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK11-NEXT:    ret i32* [[TMP1]]
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@main
@@ -1522,25 +1572,29 @@ void foo() {
 // CHECK13-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 // CHECK13-NEXT:    store i32* [[G]], i32** [[G_ADDR]], align 8
 // CHECK13-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[G_ADDR]], align 8
-// CHECK13-NEXT:    [[TMP1:%.*]] = ptrtoint i32* [[TMP0]] to i64
-// CHECK13-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], ptrtoint (i32* @g to i64)
-// CHECK13-NEXT:    br i1 [[TMP2]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
+// CHECK13-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK13-NEXT:    [[TMP2:%.*]] = ptrtoint i32* [[TMP0]] to i64
+// CHECK13-NEXT:    [[TMP3:%.*]] = ptrtoint i32* [[TMP1]] to i64
+// CHECK13-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
+// CHECK13-NEXT:    br i1 [[TMP4]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
 // CHECK13:       copyin.not.master:
-// CHECK13-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 128
-// CHECK13-NEXT:    store volatile i32 [[TMP3]], i32* @g, align 128
+// CHECK13-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 128
+// CHECK13-NEXT:    store volatile i32 [[TMP5]], i32* [[TMP1]], align 128
 // CHECK13-NEXT:    br label [[COPYIN_NOT_MASTER_END]]
 // CHECK13:       copyin.not.master.end:
-// CHECK13-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK13-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
-// CHECK13-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP5]])
-// CHECK13-NEXT:    store volatile i32 1, i32* @g, align 128
+// CHECK13-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK13-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP7]])
+// CHECK13-NEXT:    [[TMP8:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK13-NEXT:    store volatile i32 1, i32* [[TMP8]], align 128
 // CHECK13-NEXT:    call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(%class.anon.0* noundef nonnull align 1 dereferenceable(1) [[REF_TMP]])
 // CHECK13-NEXT:    ret void
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@_ZTW1g
-// CHECK13-SAME: () #[[ATTR5:[0-9]+]] comdat {
-// CHECK13-NEXT:    ret i32* @g
+// CHECK13-SAME: () #[[ATTR6:[0-9]+]] comdat {
+// CHECK13-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK13-NEXT:    ret i32* [[TMP1]]
 //
 //
 // CHECK14-LABEL: define {{[^@]+}}@main
@@ -1562,7 +1616,8 @@ void foo() {
 // CHECK14-NEXT:    store i8* [[DOTBLOCK_DESCRIPTOR]], i8** [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // CHECK14-NEXT:    [[BLOCK:%.*]] = bitcast i8* [[DOTBLOCK_DESCRIPTOR]] to <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>*
 // CHECK14-NEXT:    store <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]], <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>** [[BLOCK_ADDR]], align 8
-// CHECK14-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* @g)
+// CHECK14-NEXT:    [[TMP0:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK14-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* [[TMP0]])
 // CHECK14-NEXT:    ret void
 //
 //
@@ -1576,21 +1631,24 @@ void foo() {
 // CHECK14-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 // CHECK14-NEXT:    store i32* [[G]], i32** [[G_ADDR]], align 8
 // CHECK14-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[G_ADDR]], align 8
-// CHECK14-NEXT:    [[TMP1:%.*]] = ptrtoint i32* [[TMP0]] to i64
-// CHECK14-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], ptrtoint (i32* @g to i64)
-// CHECK14-NEXT:    br i1 [[TMP2]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
+// CHECK14-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK14-NEXT:    [[TMP2:%.*]] = ptrtoint i32* [[TMP0]] to i64
+// CHECK14-NEXT:    [[TMP3:%.*]] = ptrtoint i32* [[TMP1]] to i64
+// CHECK14-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
+// CHECK14-NEXT:    br i1 [[TMP4]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
 // CHECK14:       copyin.not.master:
-// CHECK14-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 128
-// CHECK14-NEXT:    store volatile i32 [[TMP3]], i32* @g, align 128
+// CHECK14-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 128
+// CHECK14-NEXT:    store volatile i32 [[TMP5]], i32* [[TMP1]], align 128
 // CHECK14-NEXT:    br label [[COPYIN_NOT_MASTER_END]]
 // CHECK14:       copyin.not.master.end:
-// CHECK14-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK14-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
-// CHECK14-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP5]])
-// CHECK14-NEXT:    store volatile i32 1, i32* @g, align 128
-// CHECK14-NEXT:    [[TMP6:%.*]] = load i8*, i8** getelementptr inbounds ([[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], %struct.__block_literal_generic* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global.2 to %struct.__block_literal_generic*), i32 0, i32 3), align 8
-// CHECK14-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to void (i8*)*
-// CHECK14-NEXT:    call void [[TMP7]](i8* noundef bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global.2 to i8*))
+// CHECK14-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK14-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK14-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP7]])
+// CHECK14-NEXT:    [[TMP8:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK14-NEXT:    store volatile i32 1, i32* [[TMP8]], align 128
+// CHECK14-NEXT:    [[TMP9:%.*]] = load i8*, i8** getelementptr inbounds ([[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], %struct.__block_literal_generic* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global.2 to %struct.__block_literal_generic*), i32 0, i32 3), align 8
+// CHECK14-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to void (i8*)*
+// CHECK14-NEXT:    call void [[TMP10]](i8* noundef bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global.2 to i8*))
 // CHECK14-NEXT:    ret void
 //
 //
@@ -1602,13 +1660,15 @@ void foo() {
 // CHECK14-NEXT:    store i8* [[DOTBLOCK_DESCRIPTOR]], i8** [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // CHECK14-NEXT:    [[BLOCK:%.*]] = bitcast i8* [[DOTBLOCK_DESCRIPTOR]] to <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>*
 // CHECK14-NEXT:    store <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]], <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>** [[BLOCK_ADDR]], align 8
-// CHECK14-NEXT:    store volatile i32 2, i32* @g, align 128
+// CHECK14-NEXT:    [[TMP0:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK14-NEXT:    store volatile i32 2, i32* [[TMP0]], align 128
 // CHECK14-NEXT:    ret void
 //
 //
 // CHECK14-LABEL: define {{[^@]+}}@_ZTW1g
-// CHECK14-SAME: () #[[ATTR6:[0-9]+]] comdat {
-// CHECK14-NEXT:    ret i32* @g
+// CHECK14-SAME: () #[[ATTR7:[0-9]+]] comdat {
+// CHECK14-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @g)
+// CHECK14-NEXT:    ret i32* [[TMP1]]
 //
 //
 // CHECK15-LABEL: define {{[^@]+}}@_Z10array_funcv
@@ -1630,7 +1690,9 @@ void foo() {
 // CHECK15-NEXT:    store i8 1, i8* @_ZGVZ10array_funcvE1s, align 1
 // CHECK15-NEXT:    br label [[INIT_END]]
 // CHECK15:       init.end:
-// CHECK15-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [2 x i32]*, [2 x %struct.St]*)* @.omp_outlined. to void (i32*, i32*, ...)*), [2 x i32]* @_ZZ10array_funcvE1a, [2 x %struct.St]* @_ZZ10array_funcvE1s)
+// CHECK15-NEXT:    [[TMP2:%.*]] = call [2 x i32]* @llvm.threadlocal.address.p0a2i32([2 x i32]* @_ZZ10array_funcvE1a)
+// CHECK15-NEXT:    [[TMP3:%.*]] = call [2 x %struct.St]* @llvm.threadlocal.address.p0a2s_struct.Sts([2 x %struct.St]* @_ZZ10array_funcvE1s)
+// CHECK15-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [2 x i32]*, [2 x %struct.St]*)* @.omp_outlined. to void (i32*, i32*, ...)*), [2 x i32]* [[TMP2]], [2 x %struct.St]* [[TMP3]])
 // CHECK15-NEXT:    ret void
 //
 //
@@ -1683,28 +1745,35 @@ void foo() {
 // CHECK15-NEXT:    store [2 x %struct.St]* [[S]], [2 x %struct.St]** [[S_ADDR]], align 8
 // CHECK15-NEXT:    [[TMP0:%.*]] = load [2 x i32]*, [2 x i32]** [[A_ADDR]], align 8
 // CHECK15-NEXT:    [[TMP1:%.*]] = load [2 x %struct.St]*, [2 x %struct.St]** [[S_ADDR]], align 8
-// CHECK15-NEXT:    [[TMP2:%.*]] = ptrtoint [2 x i32]* [[TMP0]] to i64
-// CHECK15-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP2]], ptrtoint ([2 x i32]* @_ZZ10array_funcvE1a to i64)
-// CHECK15-NEXT:    br i1 [[TMP3]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
+// CHECK15-NEXT:    [[TMP2:%.*]] = call [2 x i32]* @llvm.threadlocal.address.p0a2i32([2 x i32]* @_ZZ10array_funcvE1a)
+// CHECK15-NEXT:    [[TMP3:%.*]] = ptrtoint [2 x i32]* [[TMP0]] to i64
+// CHECK15-NEXT:    [[TMP4:%.*]] = ptrtoint [2 x i32]* [[TMP2]] to i64
+// CHECK15-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+// CHECK15-NEXT:    br i1 [[TMP5]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
 // CHECK15:       copyin.not.master:
-// CHECK15-NEXT:    [[TMP4:%.*]] = bitcast [2 x i32]* [[TMP0]] to i8*
-// CHECK15-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 bitcast ([2 x i32]* @_ZZ10array_funcvE1a to i8*), i8* align 4 [[TMP4]], i64 8, i1 false)
-// CHECK15-NEXT:    [[TMP5:%.*]] = bitcast [2 x %struct.St]* [[TMP1]] to %struct.St*
-// CHECK15-NEXT:    br i1 icmp eq (%struct.St* getelementptr inbounds ([2 x %struct.St], [2 x %struct.St]* @_ZZ10array_funcvE1s, i32 0, i32 0), %struct.St* getelementptr inbounds ([2 x %struct.St], [2 x %struct.St]* @_ZZ10array_funcvE1s, i64 1, i64 0)), label [[OMP_ARRAYCPY_DONE1:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// CHECK15-NEXT:    [[TMP6:%.*]] = bitcast [2 x i32]* [[TMP2]] to i8*
+// CHECK15-NEXT:    [[TMP7:%.*]] = bitcast [2 x i32]* [[TMP0]] to i8*
+// CHECK15-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP6]], i8* align 4 [[TMP7]], i64 8, i1 false)
+// CHECK15-NEXT:    [[TMP8:%.*]] = call [2 x %struct.St]* @llvm.threadlocal.address.p0a2s_struct.Sts([2 x %struct.St]* @_ZZ10array_funcvE1s)
+// CHECK15-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.St], [2 x %struct.St]* [[TMP8]], i32 0, i32 0
+// CHECK15-NEXT:    [[TMP9:%.*]] = bitcast [2 x %struct.St]* [[TMP1]] to %struct.St*
+// CHECK15-NEXT:    [[TMP10:%.*]] = getelementptr [[STRUCT_ST:%.*]], %struct.St* [[ARRAY_BEGIN]], i64 2
+// CHECK15-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq %struct.St* [[ARRAY_BEGIN]], [[TMP10]]
+// CHECK15-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE1:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK15:       omp.arraycpy.body:
-// CHECK15-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.St* [ [[TMP5]], [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK15-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.St* [ getelementptr inbounds ([2 x %struct.St], [2 x %struct.St]* @_ZZ10array_funcvE1s, i32 0, i32 0), [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK15-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.St* [ [[TMP9]], [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK15-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.St* [ [[ARRAY_BEGIN]], [[COPYIN_NOT_MASTER]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
 // CHECK15-NEXT:    [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(8) %struct.St* @_ZN2StaSERKS_(%struct.St* noundef nonnull align 4 dereferenceable(8) [[OMP_ARRAYCPY_DESTELEMENTPAST]], %struct.St* noundef nonnull align 4 dereferenceable(8) [[OMP_ARRAYCPY_SRCELEMENTPAST]])
-// CHECK15-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_ST:%.*]], %struct.St* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// CHECK15-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_ST]], %struct.St* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK15-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_ST]], %struct.St* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK15-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.St* [[OMP_ARRAYCPY_DEST_ELEMENT]], getelementptr inbounds ([2 x %struct.St], [2 x %struct.St]* @_ZZ10array_funcvE1s, i64 1, i64 0)
+// CHECK15-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.St* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP10]]
 // CHECK15-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE1]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK15:       omp.arraycpy.done1:
 // CHECK15-NEXT:    br label [[COPYIN_NOT_MASTER_END]]
 // CHECK15:       copyin.not.master.end:
-// CHECK15-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK15-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
-// CHECK15-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP7]])
+// CHECK15-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK15-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK15-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP12]])
 // CHECK15-NEXT:    ret void
 //
 //
@@ -1745,19 +1814,20 @@ void foo() {
 // CHECK16-SAME: () #[[ATTR0:[0-9]+]] section ".text.startup" {
 // CHECK16-NEXT:  entry:
 // CHECK16-NEXT:    [[CALL:%.*]] = call noundef i32 @_Z6t_initv()
-// CHECK16-NEXT:    store i32 [[CALL]], i32* @t, align 4
+// CHECK16-NEXT:    [[TMP0:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @t)
+// CHECK16-NEXT:    store i32 [[CALL]], i32* [[TMP0]], align 4
 // CHECK16-NEXT:    ret void
 //
 //
 // CHECK16-LABEL: define {{[^@]+}}@_Z3foov
-// CHECK16-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK16-SAME: () #[[ATTR3:[0-9]+]] {
 // CHECK16-NEXT:  entry:
 // CHECK16-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*))
 // CHECK16-NEXT:    ret void
 //
 //
 // CHECK16-LABEL: define {{[^@]+}}@.omp_outlined.
-// CHECK16-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK16-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
 // CHECK16-NEXT:  entry:
 // CHECK16-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 // CHECK16-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
@@ -1769,7 +1839,7 @@ void foo() {
 //
 //
 // CHECK16-LABEL: define {{[^@]+}}@.omp_outlined..1
-// CHECK16-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[T:%.*]]) #[[ATTR3]] {
+// CHECK16-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[T:%.*]]) #[[ATTR4]] {
 // CHECK16-NEXT:  entry:
 // CHECK16-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 // CHECK16-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
@@ -1799,9 +1869,10 @@ void foo() {
 //
 //
 // CHECK16-LABEL: define {{[^@]+}}@_ZTW1t
-// CHECK16-SAME: () #[[ATTR4:[0-9]+]] comdat {
+// CHECK16-SAME: () #[[ATTR5:[0-9]+]] comdat {
 // CHECK16-NEXT:    call void @_ZTH1t()
-// CHECK16-NEXT:    ret i32* @t
+// CHECK16-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @t)
+// CHECK16-NEXT:    ret i32* [[TMP1]]
 //
 //
 // CHECK16-LABEL: define {{[^@]+}}@__tls_init

diff  --git a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp
index c616cfafb7c58..de96d1e5354c4 100644
--- a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp
@@ -367,15 +367,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -388,32 +389,33 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
-// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8***, align 8
@@ -466,7 +468,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i8***)*
-// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8**, i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8

diff  --git a/clang/test/OpenMP/parallel_master_codegen.cpp b/clang/test/OpenMP/parallel_master_codegen.cpp
index dc6db7cf22f96..1fb9e62066e11 100644
--- a/clang/test/OpenMP/parallel_master_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_codegen.cpp
@@ -629,7 +629,8 @@ void parallel_master_allocate() {
 // CHECK29-LABEL: define {{[^@]+}}@_Z22parallel_master_copyinv
 // CHECK29-SAME: () #[[ATTR0:[0-9]+]] {
 // CHECK29-NEXT:  entry:
-// CHECK29-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* @a)
+// CHECK29-NEXT:    [[TMP0:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @a)
+// CHECK29-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* [[TMP0]])
 // CHECK29-NEXT:    ret void
 //
 //
@@ -643,33 +644,37 @@ void parallel_master_allocate() {
 // CHECK29-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 // CHECK29-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
 // CHECK29-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
-// CHECK29-NEXT:    [[TMP1:%.*]] = ptrtoint i32* [[TMP0]] to i64
-// CHECK29-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], ptrtoint (i32* @a to i64)
-// CHECK29-NEXT:    br i1 [[TMP2]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
+// CHECK29-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @a)
+// CHECK29-NEXT:    [[TMP2:%.*]] = ptrtoint i32* [[TMP0]] to i64
+// CHECK29-NEXT:    [[TMP3:%.*]] = ptrtoint i32* [[TMP1]] to i64
+// CHECK29-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
+// CHECK29-NEXT:    br i1 [[TMP4]], label [[COPYIN_NOT_MASTER:%.*]], label [[COPYIN_NOT_MASTER_END:%.*]]
 // CHECK29:       copyin.not.master:
-// CHECK29-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK29-NEXT:    store i32 [[TMP3]], i32* @a, align 4
+// CHECK29-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK29-NEXT:    store i32 [[TMP5]], i32* [[TMP1]], align 4
 // CHECK29-NEXT:    br label [[COPYIN_NOT_MASTER_END]]
 // CHECK29:       copyin.not.master.end:
-// CHECK29-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK29-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
-// CHECK29-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP5]])
 // CHECK29-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
 // CHECK29-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
-// CHECK29-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[TMP7]])
-// CHECK29-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
-// CHECK29-NEXT:    br i1 [[TMP9]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]]
+// CHECK29-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP7]])
+// CHECK29-NEXT:    [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK29-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK29-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[TMP9]])
+// CHECK29-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+// CHECK29-NEXT:    br i1 [[TMP11]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]]
 // CHECK29:       omp_if.then:
-// CHECK29-NEXT:    [[TMP10:%.*]] = load i32, i32* @a, align 4
-// CHECK29-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP10]], 1
-// CHECK29-NEXT:    store i32 [[INC]], i32* @a, align 4
-// CHECK29-NEXT:    call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[TMP7]])
+// CHECK29-NEXT:    [[TMP12:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @a)
+// CHECK29-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK29-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK29-NEXT:    store i32 [[INC]], i32* [[TMP12]], align 4
+// CHECK29-NEXT:    call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[TMP9]])
 // CHECK29-NEXT:    br label [[OMP_IF_END]]
 // CHECK29:       omp_if.end:
 // CHECK29-NEXT:    ret void
 //
 //
 // CHECK29-LABEL: define {{[^@]+}}@_ZTW1a
-// CHECK29-SAME: () #[[ATTR4:[0-9]+]] comdat {
-// CHECK29-NEXT:    ret i32* @a
+// CHECK29-SAME: () #[[ATTR5:[0-9]+]] comdat {
+// CHECK29-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @a)
+// CHECK29-NEXT:    ret i32* [[TMP1]]
 //

diff  --git a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
index 9e3baa763f790..b430a08dba940 100644
--- a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
@@ -322,15 +322,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -343,32 +344,33 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
-// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8***, align 8
@@ -421,7 +423,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i8***)*
-// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8**, i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8

diff  --git a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp
index 819425ec29cb5..e3b62e5d88700 100644
--- a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp
@@ -313,15 +313,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -334,32 +335,33 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
-// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8***, align 8
@@ -412,7 +414,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i8***)*
-// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8**, i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8

diff  --git a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp
index 035b72adc3bc8..02ac94aaaf3cc 100644
--- a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp
@@ -355,15 +355,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -376,32 +377,33 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
-// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8***, align 8
@@ -454,7 +456,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i8***)*
-// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8**, i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8

diff  --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp
index 458bcbe47851b..788fc6cc1ce3d 100644
--- a/clang/test/OpenMP/reduction_implicit_map.cpp
+++ b/clang/test/OpenMP/reduction_implicit_map.cpp
@@ -731,15 +731,16 @@ int main()
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = bitcast i8** [[DOTADDR]] to double**
 // CHECK1-NEXT:    [[TMP3:%.*]] = load double*, double** [[TMP2]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr double, double* [[TMP3]], i64 [[TMP4]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq double* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr double, double* [[TMP3]], i64 [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq double* [[TMP3]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi double* [ [[TMP3]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store double 0.000000e+00, double* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 8
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr double, double* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq double* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq double* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -752,24 +753,25 @@ int main()
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = bitcast i8** [[DOTADDR]] to double**
-// CHECK1-NEXT:    [[TMP4:%.*]] = load double*, double** [[TMP3]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8** [[DOTADDR1]] to double**
-// CHECK1-NEXT:    [[TMP6:%.*]] = load double*, double** [[TMP5]], align 8
-// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr double, double* [[TMP4]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq double* [[TMP4]], [[TMP7]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8** [[DOTADDR]] to double**
+// CHECK1-NEXT:    [[TMP5:%.*]] = load double*, double** [[TMP4]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = bitcast i8** [[DOTADDR1]] to double**
+// CHECK1-NEXT:    [[TMP7:%.*]] = load double*, double** [[TMP6]], align 8
+// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr double, double* [[TMP5]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq double* [[TMP5]], [[TMP8]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi double* [ [[TMP6]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi double* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load double, double* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 8
-// CHECK1-NEXT:    [[TMP9:%.*]] = load double, double* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 8
-// CHECK1-NEXT:    [[ADD:%.*]] = fadd double [[TMP8]], [[TMP9]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi double* [ [[TMP7]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi double* [ [[TMP5]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load double, double* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 8
+// CHECK1-NEXT:    [[TMP10:%.*]] = load double, double* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 8
+// CHECK1-NEXT:    [[ADD:%.*]] = fadd double [[TMP9]], [[TMP10]]
 // CHECK1-NEXT:    store double [[ADD]], double* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 8
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr double, double* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr double, double* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq double* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP7]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq double* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done2:
 // CHECK1-NEXT:    ret void
@@ -825,7 +827,7 @@ int main()
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@main
-// CHECK1-SAME: () #[[ATTR10:[0-9]+]] {
+// CHECK1-SAME: () #[[ATTR11:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
@@ -836,7 +838,7 @@ int main()
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
-// CHECK1-SAME: () #[[ATTR11:[0-9]+]] {
+// CHECK1-SAME: () #[[ATTR12:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    call void @__tgt_register_requires(i64 1)
 // CHECK1-NEXT:    ret void

diff  --git a/clang/test/OpenMP/sections_reduction_task_codegen.cpp b/clang/test/OpenMP/sections_reduction_task_codegen.cpp
index 9258ea653afca..7c6fe1a12f1b3 100644
--- a/clang/test/OpenMP/sections_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/sections_reduction_task_codegen.cpp
@@ -360,15 +360,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -381,32 +382,33 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
-// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8***, align 8
@@ -459,7 +461,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i8***)*
-// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8**, i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8

diff  --git a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp
index 0176fa4266315..47b1d390c3704 100644
--- a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp
@@ -43,7 +43,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
 // CHECK1-NEXT:    store i8** [[ARGV]], i8*** [[ARGV_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = load i8**, i8*** [[ARGV_ADDR]], align 8
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l14(i32* [[ARGC_ADDR]], i8** [[TMP0]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l14(i32* [[ARGC_ADDR]], i8** [[TMP0]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    ret i32 0
 //
 //
@@ -380,15 +380,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -401,32 +402,33 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
-// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8***, align 8
@@ -479,7 +481,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i8***)*
-// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR5]]
+// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR6]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8**, i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8

diff  --git a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
index 95b591365bb24..ee44362bc2b78 100644
--- a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
@@ -43,7 +43,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
 // CHECK1-NEXT:    store i8** [[ARGV]], i8*** [[ARGV_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = load i8**, i8*** [[ARGV_ADDR]], align 8
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l14(i32* [[ARGC_ADDR]], i8** [[TMP0]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l14(i32* [[ARGC_ADDR]], i8** [[TMP0]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    ret i32 0
 //
 //
@@ -326,15 +326,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -347,32 +348,33 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
-// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8***, align 8
@@ -425,7 +427,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i8***)*
-// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR5]]
+// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR6]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8**, i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8

diff  --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
index b48d34c301f50..5c804780f59ff 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
@@ -43,7 +43,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
 // CHECK1-NEXT:    store i8** [[ARGV]], i8*** [[ARGV_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = load i8**, i8*** [[ARGV_ADDR]], align 8
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l14(i32* [[ARGC_ADDR]], i8** [[TMP0]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l14(i32* [[ARGC_ADDR]], i8** [[TMP0]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    ret i32 0
 //
 //
@@ -353,15 +353,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -374,25 +375,26 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
@@ -726,15 +728,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -747,32 +750,33 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
-// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8***, align 8
@@ -825,7 +829,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i8***)*
-// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR5]]
+// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR6]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8**, i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8

diff  --git a/clang/test/OpenMP/taskloop_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_reduction_codegen.cpp
index b64c9409d44e1..c5c6394254f8b 100644
--- a/clang/test/OpenMP/taskloop_reduction_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_reduction_codegen.cpp
@@ -186,12 +186,14 @@ sum = 0.0;
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
 
 // CHECK: define internal void @[[RED_FINI2]](i8* noundef %0)
-// CHECK: load i64, i64* [[RED_SIZE1]]
+// CHECK: [[RED_SIZE1_ADDR:%.+]] = call i64* @llvm.threadlocal.address.p0i64(i64* [[RED_SIZE1]]
+// CHECK: load i64, i64* [[RED_SIZE1_ADDR]]
 // CHECK: call void @
 // CHECK: ret void
 
 // CHECK: define internal void @[[RED_COMB2]](i8* noundef %0, i8* noundef %1)
-// CHECK: load i64, i64* [[RED_SIZE1]]
+// CHECK: [[RED_SIZE1_ADDR2:%.+]] = call i64* @llvm.threadlocal.address.p0i64(i64* [[RED_SIZE1]]
+// CHECK: load i64, i64* [[RED_SIZE1_ADDR2]]
 // CHECK: call void [[OMP_COMB1]](
 // CHECK: ret void
 
@@ -205,12 +207,14 @@ sum = 0.0;
 // CHECK: ret void
 
 // CHECK: define internal void @[[RED_INIT4]](i8* noalias noundef %{{.+}}, i8* noalias noundef %{{.+}})
-// CHECK: load i64, i64* [[RED_SIZE2]]
+// CHECK: [[RED_SIZE2_ADDR:%.+]] = call i64* @llvm.threadlocal.address.p0i64(i64* [[RED_SIZE2]]
+// CHECK: load i64, i64* [[RED_SIZE2_ADDR]]
 // CHECK: store float 0.000000e+00, float* %
 // CHECK: ret void
 
 // CHECK: define internal void @[[RED_COMB4]](i8* noundef %0, i8* noundef %1)
-// CHECK: load i64, i64* [[RED_SIZE2]]
+// CHECK: [[RED_SIZE2_ADDR2:%.+]] = call i64* @llvm.threadlocal.address.p0i64(i64* [[RED_SIZE2]]
+// CHECK: load i64, i64* [[RED_SIZE2_ADDR2]]
 // CHECK: fadd float %
 // CHECK: store float %{{.+}}, float* %
 // CHECK: ret void

diff  --git a/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp
index d923207f7804d..80256bd85b211 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp
@@ -103,52 +103,53 @@ int main() {
 // CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 8
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store i32 0, i32* [[RETVAL]], align 4
-// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, i32* @x, align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @x)
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[X_CASTED]] to i32*
-// CHECK1-NEXT:    store i32 [[TMP0]], i32* [[CONV]], align 4
-// CHECK1-NEXT:    [[TMP1:%.*]] = load i64, i64* [[X_CASTED]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i64*
-// CHECK1-NEXT:    store i64 [[TMP1]], i64* [[TMP3]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64*
-// CHECK1-NEXT:    store i64 [[TMP1]], i64* [[TMP5]], align 8
-// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
-// CHECK1-NEXT:    store i8* null, i8** [[TMP6]], align 8
-// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [2 x i32]**
-// CHECK1-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP8]], align 8
-// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [2 x i32]**
-// CHECK1-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP10]], align 8
-// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
-// CHECK1-NEXT:    store i8* null, i8** [[TMP11]], align 8
-// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[CONV]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* [[X_CASTED]], align 8
+// CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i64*
+// CHECK1-NEXT:    store i64 [[TMP2]], i64* [[TMP4]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP6:%.*]] = bitcast i8** [[TMP5]] to i64*
+// CHECK1-NEXT:    store i64 [[TMP2]], i64* [[TMP6]], align 8
+// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT:    store i8* null, i8** [[TMP7]], align 8
+// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK1-NEXT:    [[TMP9:%.*]] = bitcast i8** [[TMP8]] to [2 x i32]**
+// CHECK1-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP9]], align 8
+// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK1-NEXT:    [[TMP11:%.*]] = bitcast i8** [[TMP10]] to [2 x i32]**
+// CHECK1-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP11]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK1-NEXT:    store i8* null, i8** [[TMP12]], align 8
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
-// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 1, i32* [[TMP14]], align 4
-// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1
-// CHECK1-NEXT:    store i32 2, i32* [[TMP15]], align 4
-// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2
-// CHECK1-NEXT:    store i8** [[TMP12]], i8*** [[TMP16]], align 8
-// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 1, i32* [[TMP15]], align 4
+// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 2, i32* [[TMP16]], align 4
+// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2
 // CHECK1-NEXT:    store i8** [[TMP13]], i8*** [[TMP17]], align 8
-// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4
-// CHECK1-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP18]], align 8
-// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK1-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP19]], align 8
-// CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6
-// CHECK1-NEXT:    store i8** null, i8*** [[TMP20]], align 8
-// CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT:    store i8** [[TMP14]], i8*** [[TMP18]], align 8
+// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP19]], align 8
+// CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP20]], align 8
+// CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK1-NEXT:    store i8** null, i8*** [[TMP21]], align 8
-// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8
-// CHECK1-NEXT:    store i64 2, i64* [[TMP22]], align 8
-// CHECK1-NEXT:    [[TMP23:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]])
-// CHECK1-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
-// CHECK1-NEXT:    br i1 [[TMP24]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT:    store i8** null, i8*** [[TMP22]], align 8
+// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 2, i64* [[TMP23]], align 8
+// CHECK1-NEXT:    [[TMP24:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]])
+// CHECK1-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK1-NEXT:    br i1 [[TMP25]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK1:       omp_offload.failed:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64(i64 [[TMP1]], [2 x i32]* [[A]]) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64(i64 [[TMP2]], [2 x i32]* [[A]]) #[[ATTR2:[0-9]+]]
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v()
@@ -316,7 +317,7 @@ int main() {
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v
-// CHECK1-SAME: () #[[ATTR3:[0-9]+]] comdat {
+// CHECK1-SAME: () #[[ATTR4:[0-9]+]] comdat {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[A:%.*]] = alloca [2 x i32], align 4
 // CHECK1-NEXT:    [[X_CASTED:%.*]] = alloca i64, align 8
@@ -324,52 +325,53 @@ int main() {
 // CHECK1-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8
 // CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 8
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, i32* @x, align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @x)
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[X_CASTED]] to i32*
-// CHECK1-NEXT:    store i32 [[TMP0]], i32* [[CONV]], align 4
-// CHECK1-NEXT:    [[TMP1:%.*]] = load i64, i64* [[X_CASTED]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i64*
-// CHECK1-NEXT:    store i64 [[TMP1]], i64* [[TMP3]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64*
-// CHECK1-NEXT:    store i64 [[TMP1]], i64* [[TMP5]], align 8
-// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
-// CHECK1-NEXT:    store i8* null, i8** [[TMP6]], align 8
-// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [2 x i32]**
-// CHECK1-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP8]], align 8
-// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [2 x i32]**
-// CHECK1-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP10]], align 8
-// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
-// CHECK1-NEXT:    store i8* null, i8** [[TMP11]], align 8
-// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[CONV]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* [[X_CASTED]], align 8
+// CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i64*
+// CHECK1-NEXT:    store i64 [[TMP2]], i64* [[TMP4]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP6:%.*]] = bitcast i8** [[TMP5]] to i64*
+// CHECK1-NEXT:    store i64 [[TMP2]], i64* [[TMP6]], align 8
+// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT:    store i8* null, i8** [[TMP7]], align 8
+// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK1-NEXT:    [[TMP9:%.*]] = bitcast i8** [[TMP8]] to [2 x i32]**
+// CHECK1-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP9]], align 8
+// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK1-NEXT:    [[TMP11:%.*]] = bitcast i8** [[TMP10]] to [2 x i32]**
+// CHECK1-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP11]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK1-NEXT:    store i8* null, i8** [[TMP12]], align 8
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
-// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 1, i32* [[TMP14]], align 4
-// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1
-// CHECK1-NEXT:    store i32 2, i32* [[TMP15]], align 4
-// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2
-// CHECK1-NEXT:    store i8** [[TMP12]], i8*** [[TMP16]], align 8
-// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 1, i32* [[TMP15]], align 4
+// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 2, i32* [[TMP16]], align 4
+// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2
 // CHECK1-NEXT:    store i8** [[TMP13]], i8*** [[TMP17]], align 8
-// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4
-// CHECK1-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.4, i32 0, i32 0), i64** [[TMP18]], align 8
-// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK1-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.5, i32 0, i32 0), i64** [[TMP19]], align 8
-// CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6
-// CHECK1-NEXT:    store i8** null, i8*** [[TMP20]], align 8
-// CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT:    store i8** [[TMP14]], i8*** [[TMP18]], align 8
+// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.4, i32 0, i32 0), i64** [[TMP19]], align 8
+// CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.5, i32 0, i32 0), i64** [[TMP20]], align 8
+// CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK1-NEXT:    store i8** null, i8*** [[TMP21]], align 8
-// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8
-// CHECK1-NEXT:    store i64 2, i64* [[TMP22]], align 8
-// CHECK1-NEXT:    [[TMP23:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3]], i64 -1, i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]])
-// CHECK1-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
-// CHECK1-NEXT:    br i1 [[TMP24]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT:    store i8** null, i8*** [[TMP22]], align 8
+// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 2, i64* [[TMP23]], align 8
+// CHECK1-NEXT:    [[TMP24:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3]], i64 -1, i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]])
+// CHECK1-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK1-NEXT:    br i1 [[TMP25]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK1:       omp_offload.failed:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34(i64 [[TMP1]], [2 x i32]* [[A]]) #[[ATTR2]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34(i64 [[TMP2]], [2 x i32]* [[A]]) #[[ATTR2]]
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    ret i32 0
@@ -536,12 +538,13 @@ int main() {
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_ZTW1x
-// CHECK1-SAME: () #[[ATTR4:[0-9]+]] comdat {
-// CHECK1-NEXT:    ret i32* @x
+// CHECK1-SAME: () #[[ATTR5:[0-9]+]] comdat {
+// CHECK1-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @x)
+// CHECK1-NEXT:    ret i32* [[TMP1]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
-// CHECK1-SAME: () #[[ATTR5:[0-9]+]] {
+// CHECK1-SAME: () #[[ATTR6:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    call void @__tgt_register_requires(i64 1)
 // CHECK1-NEXT:    ret void
@@ -558,51 +561,52 @@ int main() {
 // CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 4
 // CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    store i32 0, i32* [[RETVAL]], align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, i32* @x, align 4
-// CHECK3-NEXT:    store i32 [[TMP0]], i32* [[X_CASTED]], align 4
-// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[X_CASTED]], align 4
-// CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32*
-// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[TMP3]], align 4
-// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32*
-// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[TMP5]], align 4
-// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
-// CHECK3-NEXT:    store i8* null, i8** [[TMP6]], align 4
-// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [2 x i32]**
-// CHECK3-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP8]], align 4
-// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [2 x i32]**
-// CHECK3-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP10]], align 4
-// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
-// CHECK3-NEXT:    store i8* null, i8** [[TMP11]], align 4
-// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @x)
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[X_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, i32* [[X_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32*
+// CHECK3-NEXT:    store i32 [[TMP2]], i32* [[TMP4]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8** [[TMP5]] to i32*
+// CHECK3-NEXT:    store i32 [[TMP2]], i32* [[TMP6]], align 4
+// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT:    store i8* null, i8** [[TMP7]], align 4
+// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP9:%.*]] = bitcast i8** [[TMP8]] to [2 x i32]**
+// CHECK3-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP9]], align 4
+// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP11:%.*]] = bitcast i8** [[TMP10]] to [2 x i32]**
+// CHECK3-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP11]], align 4
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT:    store i8* null, i8** [[TMP12]], align 4
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
-// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 1, i32* [[TMP14]], align 4
-// CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1
-// CHECK3-NEXT:    store i32 2, i32* [[TMP15]], align 4
-// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2
-// CHECK3-NEXT:    store i8** [[TMP12]], i8*** [[TMP16]], align 4
-// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 1, i32* [[TMP15]], align 4
+// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 2, i32* [[TMP16]], align 4
+// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2
 // CHECK3-NEXT:    store i8** [[TMP13]], i8*** [[TMP17]], align 4
-// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4
-// CHECK3-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP18]], align 4
-// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK3-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP19]], align 4
-// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6
-// CHECK3-NEXT:    store i8** null, i8*** [[TMP20]], align 4
-// CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT:    store i8** [[TMP14]], i8*** [[TMP18]], align 4
+// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP19]], align 4
+// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP20]], align 4
+// CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK3-NEXT:    store i8** null, i8*** [[TMP21]], align 4
-// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8
-// CHECK3-NEXT:    store i64 2, i64* [[TMP22]], align 8
-// CHECK3-NEXT:    [[TMP23:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]])
-// CHECK3-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
-// CHECK3-NEXT:    br i1 [[TMP24]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT:    store i8** null, i8*** [[TMP22]], align 4
+// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 2, i64* [[TMP23]], align 8
+// CHECK3-NEXT:    [[TMP24:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]])
+// CHECK3-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK3-NEXT:    br i1 [[TMP25]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK3:       omp_offload.failed:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64(i32 [[TMP1]], [2 x i32]* [[A]]) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64(i32 [[TMP2]], [2 x i32]* [[A]]) #[[ATTR2:[0-9]+]]
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v()
@@ -764,7 +768,7 @@ int main() {
 //
 //
 // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v
-// CHECK3-SAME: () #[[ATTR3:[0-9]+]] comdat {
+// CHECK3-SAME: () #[[ATTR4:[0-9]+]] comdat {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[A:%.*]] = alloca [2 x i32], align 4
 // CHECK3-NEXT:    [[X_CASTED:%.*]] = alloca i32, align 4
@@ -772,51 +776,52 @@ int main() {
 // CHECK3-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 4
 // CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 4
 // CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, i32* @x, align 4
-// CHECK3-NEXT:    store i32 [[TMP0]], i32* [[X_CASTED]], align 4
-// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[X_CASTED]], align 4
-// CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32*
-// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[TMP3]], align 4
-// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32*
-// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[TMP5]], align 4
-// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
-// CHECK3-NEXT:    store i8* null, i8** [[TMP6]], align 4
-// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [2 x i32]**
-// CHECK3-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP8]], align 4
-// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [2 x i32]**
-// CHECK3-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP10]], align 4
-// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
-// CHECK3-NEXT:    store i8* null, i8** [[TMP11]], align 4
-// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @x)
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 [[TMP1]], i32* [[X_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, i32* [[X_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32*
+// CHECK3-NEXT:    store i32 [[TMP2]], i32* [[TMP4]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8** [[TMP5]] to i32*
+// CHECK3-NEXT:    store i32 [[TMP2]], i32* [[TMP6]], align 4
+// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT:    store i8* null, i8** [[TMP7]], align 4
+// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP9:%.*]] = bitcast i8** [[TMP8]] to [2 x i32]**
+// CHECK3-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP9]], align 4
+// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP11:%.*]] = bitcast i8** [[TMP10]] to [2 x i32]**
+// CHECK3-NEXT:    store [2 x i32]* [[A]], [2 x i32]** [[TMP11]], align 4
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT:    store i8* null, i8** [[TMP12]], align 4
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
-// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 1, i32* [[TMP14]], align 4
-// CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1
-// CHECK3-NEXT:    store i32 2, i32* [[TMP15]], align 4
-// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2
-// CHECK3-NEXT:    store i8** [[TMP12]], i8*** [[TMP16]], align 4
-// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 1, i32* [[TMP15]], align 4
+// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 2, i32* [[TMP16]], align 4
+// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2
 // CHECK3-NEXT:    store i8** [[TMP13]], i8*** [[TMP17]], align 4
-// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4
-// CHECK3-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.4, i32 0, i32 0), i64** [[TMP18]], align 4
-// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK3-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.5, i32 0, i32 0), i64** [[TMP19]], align 4
-// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6
-// CHECK3-NEXT:    store i8** null, i8*** [[TMP20]], align 4
-// CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT:    store i8** [[TMP14]], i8*** [[TMP18]], align 4
+// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.4, i32 0, i32 0), i64** [[TMP19]], align 4
+// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT:    store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.5, i32 0, i32 0), i64** [[TMP20]], align 4
+// CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK3-NEXT:    store i8** null, i8*** [[TMP21]], align 4
-// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8
-// CHECK3-NEXT:    store i64 2, i64* [[TMP22]], align 8
-// CHECK3-NEXT:    [[TMP23:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3]], i64 -1, i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]])
-// CHECK3-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
-// CHECK3-NEXT:    br i1 [[TMP24]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT:    store i8** null, i8*** [[TMP22]], align 4
+// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 2, i64* [[TMP23]], align 8
+// CHECK3-NEXT:    [[TMP24:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3]], i64 -1, i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]])
+// CHECK3-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK3-NEXT:    br i1 [[TMP25]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK3:       omp_offload.failed:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34(i32 [[TMP1]], [2 x i32]* [[A]]) #[[ATTR2]]
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34(i32 [[TMP2]], [2 x i32]* [[A]]) #[[ATTR2]]
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    ret i32 0
@@ -977,12 +982,13 @@ int main() {
 //
 //
 // CHECK3-LABEL: define {{[^@]+}}@_ZTW1x
-// CHECK3-SAME: () #[[ATTR4:[0-9]+]] comdat {
-// CHECK3-NEXT:    ret i32* @x
+// CHECK3-SAME: () #[[ATTR5:[0-9]+]] comdat {
+// CHECK3-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @x)
+// CHECK3-NEXT:    ret i32* [[TMP1]]
 //
 //
 // CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
-// CHECK3-SAME: () #[[ATTR5:[0-9]+]] {
+// CHECK3-SAME: () #[[ATTR6:[0-9]+]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    call void @__tgt_register_requires(i64 1)
 // CHECK3-NEXT:    ret void
@@ -1170,12 +1176,13 @@ int main() {
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@_ZTW1x
-// CHECK9-SAME: () #[[ATTR4:[0-9]+]] comdat {
-// CHECK9-NEXT:    ret i32* @x
+// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat {
+// CHECK9-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @x)
+// CHECK9-NEXT:    ret i32* [[TMP1]]
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
-// CHECK9-SAME: () #[[ATTR5:[0-9]+]] {
+// CHECK9-SAME: () #[[ATTR6:[0-9]+]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    call void @__tgt_register_requires(i64 1)
 // CHECK9-NEXT:    ret void

diff  --git a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp
index 9e19165ac9a39..f4a9fab81bedb 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp
@@ -49,7 +49,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i32 [[TMP0]], i32* [[CONV]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[ARGV_ADDR]], align 8
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l14(i64 [[TMP1]], i8** [[TMP2]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l14(i64 [[TMP1]], i8** [[TMP2]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    ret i32 0
 //
 //
@@ -359,15 +359,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -380,25 +381,26 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
@@ -732,15 +734,16 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP3]]
-// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP4]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq i8* [[TMP2]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
 // CHECK1:       omp.arrayinit.body:
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
 // CHECK1-NEXT:    store i8 0, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
 // CHECK1:       omp.arrayinit.done:
 // CHECK1-NEXT:    ret void
@@ -753,32 +756,33 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8*, align 8
 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
 // CHECK1-NEXT:    store i8* [[TMP1]], i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* @{{reduction_size[.].+[.]}}, align 8
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[TMP2]]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP3]], [[TMP5]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = call i64* @llvm.threadlocal.address.p0i64(i64* @{{reduction_size[.].+[.]}})
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP4]], i64 [[TMP3]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq i8* [[TMP4]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // CHECK1:       omp.arraycpy.body:
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP3]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP6]] to i32
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
-// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi i8* [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi i8* [ [[TMP4]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i32
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP8]] to i32
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], [[CONV2]]
 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
 // CHECK1-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i8, i8* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]]
+// CHECK1-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq i8* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK1:       omp.arraycpy.done4:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
-// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] {
+// CHECK1-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i8*** noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i8***, align 8
@@ -831,7 +835,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i8***)*
-// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR5]]
+// CHECK1-NEXT:    call void [[TMP15]](i8* [[TMP14]], i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR6]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8**, i8*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32*, i32** [[TMP17]], align 8

diff  --git a/clang/test/OpenMP/threadprivate_codegen.cpp b/clang/test/OpenMP/threadprivate_codegen.cpp
index 4f3ab40ad723d..ff3fec914c2e2 100644
--- a/clang/test/OpenMP/threadprivate_codegen.cpp
+++ b/clang/test/OpenMP/threadprivate_codegen.cpp
@@ -3662,57 +3662,62 @@ int foobar() {
 // CHECK-TLS1-NEXT:    [[A1:%.*]] = getelementptr inbounds [[STRUCT_S3:%.*]], %struct.S3* [[TMP4]], i32 0, i32 0
 // CHECK-TLS1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A1]], align 4
 // CHECK-TLS1-NEXT:    store i32 [[TMP5]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_SMAIN:%.*]], %struct.Smain* @_ZZ4mainE2sm, i32 0, i32 0), align 8
-// CHECK-TLS1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
+// CHECK-TLS1-NEXT:    [[TMP6:%.*]] = call %struct.Smain* @llvm.threadlocal.address.p0s_struct.Smains(%struct.Smain* @_ZZ4mainE2sm)
+// CHECK-TLS1-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[TMP6]], i32 0, i32 0
+// CHECK-TLS1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A2]], align 8
+// CHECK-TLS1-NEXT:    [[TMP8:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP7]]
 // CHECK-TLS1-NEXT:    store i32 [[ADD]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP8:%.*]] = call %struct.S1* @_ZTWL3gs1()
-// CHECK-TLS1-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP8]], i32 0, i32 0
-// CHECK-TLS1-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A2]], align 4
-// CHECK-TLS1-NEXT:    [[TMP10:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], [[TMP9]]
-// CHECK-TLS1-NEXT:    store i32 [[ADD3]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8
-// CHECK-TLS1-NEXT:    [[TMP12:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK-TLS1-NEXT:    [[TMP9:%.*]] = call %struct.S1* @_ZTWL3gs1()
+// CHECK-TLS1-NEXT:    [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP9]], i32 0, i32 0
+// CHECK-TLS1-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A3]], align 4
+// CHECK-TLS1-NEXT:    [[TMP11:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP10]]
 // CHECK-TLS1-NEXT:    store i32 [[ADD4]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP13:%.*]] = call %struct.S5* @_ZTW3gs3()
-// CHECK-TLS1-NEXT:    [[A5:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP13]], i32 0, i32 0
-// CHECK-TLS1-NEXT:    [[TMP14:%.*]] = load i32, i32* [[A5]], align 4
-// CHECK-TLS1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP15]], [[TMP14]]
-// CHECK-TLS1-NEXT:    store i32 [[ADD6]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP16:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x()
-// CHECK-TLS1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP16]], i64 0, i64 1
-// CHECK-TLS1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1
-// CHECK-TLS1-NEXT:    [[A8:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX7]], i32 0, i32 0
-// CHECK-TLS1-NEXT:    [[TMP17:%.*]] = load i32, i32* [[A8]], align 4
-// CHECK-TLS1-NEXT:    [[TMP18:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP18]], [[TMP17]]
-// CHECK-TLS1-NEXT:    store i32 [[ADD9]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP19:%.*]] = load i32, i32* @_ZN2STIiE2stE, align 4
-// CHECK-TLS1-NEXT:    [[TMP20:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP20]], [[TMP19]]
+// CHECK-TLS1-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8
+// CHECK-TLS1-NEXT:    [[TMP13:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+// CHECK-TLS1-NEXT:    store i32 [[ADD5]], i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[TMP14:%.*]] = call %struct.S5* @_ZTW3gs3()
+// CHECK-TLS1-NEXT:    [[A6:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP14]], i32 0, i32 0
+// CHECK-TLS1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A6]], align 4
+// CHECK-TLS1-NEXT:    [[TMP16:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+// CHECK-TLS1-NEXT:    store i32 [[ADD7]], i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[TMP17:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x()
+// CHECK-TLS1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP17]], i64 0, i64 1
+// CHECK-TLS1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1
+// CHECK-TLS1-NEXT:    [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX8]], i32 0, i32 0
+// CHECK-TLS1-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A9]], align 4
+// CHECK-TLS1-NEXT:    [[TMP19:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP19]], [[TMP18]]
 // CHECK-TLS1-NEXT:    store i32 [[ADD10]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP21:%.*]] = load float, float* @_ZN2STIfE2stE, align 4
-// CHECK-TLS1-NEXT:    [[CONV:%.*]] = fptosi float [[TMP21]] to i32
+// CHECK-TLS1-NEXT:    [[TMP20:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN2STIiE2stE)
+// CHECK-TLS1-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4
 // CHECK-TLS1-NEXT:    [[TMP22:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[CONV]]
+// CHECK-TLS1-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[TMP21]]
 // CHECK-TLS1-NEXT:    store i32 [[ADD11]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP23:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE()
-// CHECK-TLS1-NEXT:    [[A12:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP23]], i32 0, i32 0
-// CHECK-TLS1-NEXT:    [[TMP24:%.*]] = load i32, i32* [[A12]], align 4
+// CHECK-TLS1-NEXT:    [[TMP23:%.*]] = call float* @llvm.threadlocal.address.p0f32(float* @_ZN2STIfE2stE)
+// CHECK-TLS1-NEXT:    [[TMP24:%.*]] = load float, float* [[TMP23]], align 4
+// CHECK-TLS1-NEXT:    [[CONV:%.*]] = fptosi float [[TMP24]] to i32
 // CHECK-TLS1-NEXT:    [[TMP25:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP25]], [[TMP24]]
-// CHECK-TLS1-NEXT:    store i32 [[ADD13]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP26:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    ret i32 [[TMP26]]
+// CHECK-TLS1-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP25]], [[CONV]]
+// CHECK-TLS1-NEXT:    store i32 [[ADD12]], i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[TMP26:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE()
+// CHECK-TLS1-NEXT:    [[A13:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP26]], i32 0, i32 0
+// CHECK-TLS1-NEXT:    [[TMP27:%.*]] = load i32, i32* [[A13]], align 4
+// CHECK-TLS1-NEXT:    [[TMP28:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP28]], [[TMP27]]
+// CHECK-TLS1-NEXT:    store i32 [[ADD14]], i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[TMP29:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    ret i32 [[TMP29]]
 //
 //
 // CHECK-TLS1-LABEL: define {{[^@]+}}@_ZTWL3gs1
 // CHECK-TLS1-SAME: () #[[ATTR5:[0-9]+]] {
 // CHECK-TLS1-NEXT:    call void @_ZTHL3gs1()
-// CHECK-TLS1-NEXT:    ret %struct.S1* @_ZL3gs1
+// CHECK-TLS1-NEXT:    [[TMP1:%.*]] = call %struct.S1* @llvm.threadlocal.address.p0s_struct.S1s(%struct.S1* @_ZL3gs1)
+// CHECK-TLS1-NEXT:    ret %struct.S1* [[TMP1]]
 //
 //
 // CHECK-TLS1-LABEL: define {{[^@]+}}@_ZZ4mainEN5SmainC1Ei
@@ -3745,7 +3750,8 @@ int foobar() {
 // CHECK-TLS1-NEXT:    call void @_ZTHN6Static1sE()
 // CHECK-TLS1-NEXT:    br label [[TMP2]]
 // CHECK-TLS1:       2:
-// CHECK-TLS1-NEXT:    ret %struct.S3* @_ZN6Static1sE
+// CHECK-TLS1-NEXT:    [[TMP3:%.*]] = call %struct.S3* @llvm.threadlocal.address.p0s_struct.S3s(%struct.S3* @_ZN6Static1sE)
+// CHECK-TLS1-NEXT:    ret %struct.S3* [[TMP3]]
 //
 //
 // CHECK-TLS1-LABEL: define {{[^@]+}}@_ZTW3gs3
@@ -3755,19 +3761,22 @@ int foobar() {
 // CHECK-TLS1-NEXT:    call void @_ZTH3gs3()
 // CHECK-TLS1-NEXT:    br label [[TMP2]]
 // CHECK-TLS1:       2:
-// CHECK-TLS1-NEXT:    ret %struct.S5* @gs3
+// CHECK-TLS1-NEXT:    [[TMP3:%.*]] = call %struct.S5* @llvm.threadlocal.address.p0s_struct.S5s(%struct.S5* @gs3)
+// CHECK-TLS1-NEXT:    ret %struct.S5* [[TMP3]]
 //
 //
 // CHECK-TLS1-LABEL: define {{[^@]+}}@_ZTW5arr_x
 // CHECK-TLS1-SAME: () #[[ATTR5]] comdat {
 // CHECK-TLS1-NEXT:    call void @_ZTH5arr_x()
-// CHECK-TLS1-NEXT:    ret [2 x [3 x %struct.S1]]* @arr_x
+// CHECK-TLS1-NEXT:    [[TMP1:%.*]] = call [2 x [3 x %struct.S1]]* @llvm.threadlocal.address.p0a2a3s_struct.S1s([2 x [3 x %struct.S1]]* @arr_x)
+// CHECK-TLS1-NEXT:    ret [2 x [3 x %struct.S1]]* [[TMP1]]
 //
 //
 // CHECK-TLS1-LABEL: define {{[^@]+}}@_ZTWN2STI2S4E2stE
 // CHECK-TLS1-SAME: () #[[ATTR5]] comdat {
 // CHECK-TLS1-NEXT:    call void @_ZTHN2STI2S4E2stE()
-// CHECK-TLS1-NEXT:    ret %struct.S4* @_ZN2STI2S4E2stE
+// CHECK-TLS1-NEXT:    [[TMP1:%.*]] = call %struct.S4* @llvm.threadlocal.address.p0s_struct.S4s(%struct.S4* @_ZN2STI2S4E2stE)
+// CHECK-TLS1-NEXT:    ret %struct.S4* [[TMP1]]
 //
 //
 // CHECK-TLS1-LABEL: define {{[^@]+}}@_ZZ4mainEN5SmainC2Ei
@@ -3796,7 +3805,7 @@ int foobar() {
 //
 //
 // CHECK-TLS1-LABEL: define {{[^@]+}}@_Z6foobarv
-// CHECK-TLS1-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK-TLS1-SAME: () #[[ATTR7:[0-9]+]] {
 // CHECK-TLS1-NEXT:  entry:
 // CHECK-TLS1-NEXT:    [[RES:%.*]] = alloca i32, align 4
 // CHECK-TLS1-NEXT:    [[TMP0:%.*]] = call %struct.S3* @_ZTWN6Static1sE()
@@ -3827,23 +3836,25 @@ int foobar() {
 // CHECK-TLS1-NEXT:    [[TMP12:%.*]] = load i32, i32* [[RES]], align 4
 // CHECK-TLS1-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
 // CHECK-TLS1-NEXT:    store i32 [[ADD7]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP13:%.*]] = load i32, i32* @_ZN2STIiE2stE, align 4
-// CHECK-TLS1-NEXT:    [[TMP14:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+// CHECK-TLS1-NEXT:    [[TMP13:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN2STIiE2stE)
+// CHECK-TLS1-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK-TLS1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP15]], [[TMP14]]
 // CHECK-TLS1-NEXT:    store i32 [[ADD8]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP15:%.*]] = load float, float* @_ZN2STIfE2stE, align 4
-// CHECK-TLS1-NEXT:    [[CONV:%.*]] = fptosi float [[TMP15]] to i32
-// CHECK-TLS1-NEXT:    [[TMP16:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP16]], [[CONV]]
+// CHECK-TLS1-NEXT:    [[TMP16:%.*]] = call float* @llvm.threadlocal.address.p0f32(float* @_ZN2STIfE2stE)
+// CHECK-TLS1-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4
+// CHECK-TLS1-NEXT:    [[CONV:%.*]] = fptosi float [[TMP17]] to i32
+// CHECK-TLS1-NEXT:    [[TMP18:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP18]], [[CONV]]
 // CHECK-TLS1-NEXT:    store i32 [[ADD9]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP17:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE()
-// CHECK-TLS1-NEXT:    [[A10:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP17]], i32 0, i32 0
-// CHECK-TLS1-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A10]], align 4
-// CHECK-TLS1-NEXT:    [[TMP19:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP19]], [[TMP18]]
+// CHECK-TLS1-NEXT:    [[TMP19:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE()
+// CHECK-TLS1-NEXT:    [[A10:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP19]], i32 0, i32 0
+// CHECK-TLS1-NEXT:    [[TMP20:%.*]] = load i32, i32* [[A10]], align 4
+// CHECK-TLS1-NEXT:    [[TMP21:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP21]], [[TMP20]]
 // CHECK-TLS1-NEXT:    store i32 [[ADD11]], i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    [[TMP20:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS1-NEXT:    ret i32 [[TMP20]]
+// CHECK-TLS1-NEXT:    [[TMP22:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS1-NEXT:    ret i32 [[TMP22]]
 //
 //
 // CHECK-TLS1-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
@@ -3953,59 +3964,62 @@ int foobar() {
 // CHECK-TLS2-NEXT:    [[A1:%.*]] = getelementptr inbounds [[STRUCT_S3:%.*]], %struct.S3* [[TMP4]], i32 0, i32 0
 // CHECK-TLS2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A1]], align 4
 // CHECK-TLS2-NEXT:    store i32 [[TMP5]], i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_SMAIN:%.*]], %struct.Smain* @_ZZ4mainE2sm, i32 0, i32 0), align 8
-// CHECK-TLS2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
+// CHECK-TLS2-NEXT:    [[TMP6:%.*]] = call %struct.Smain* @llvm.threadlocal.address.p0s_struct.Smains(%struct.Smain* @_ZZ4mainE2sm)
+// CHECK-TLS2-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[TMP6]], i32 0, i32 0
+// CHECK-TLS2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A2]], align 8
+// CHECK-TLS2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP7]]
 // CHECK-TLS2-NEXT:    store i32 [[ADD]], i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[TMP8:%.*]] = call %struct.S1* @_ZTWL3gs1()
-// CHECK-TLS2-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP8]], i32 0, i32 0
-// CHECK-TLS2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A2]], align 4
-// CHECK-TLS2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], [[TMP9]]
-// CHECK-TLS2-NEXT:    store i32 [[ADD3]], i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8
-// CHECK-TLS2-NEXT:    [[TMP12:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK-TLS2-NEXT:    [[TMP9:%.*]] = call %struct.S1* @_ZTWL3gs1()
+// CHECK-TLS2-NEXT:    [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP9]], i32 0, i32 0
+// CHECK-TLS2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A3]], align 4
+// CHECK-TLS2-NEXT:    [[TMP11:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP10]]
 // CHECK-TLS2-NEXT:    store i32 [[ADD4]], i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[TMP13:%.*]] = call %struct.S5* @_ZTW3gs3()
-// CHECK-TLS2-NEXT:    [[A5:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP13]], i32 0, i32 0
-// CHECK-TLS2-NEXT:    [[TMP14:%.*]] = load i32, i32* [[A5]], align 4
-// CHECK-TLS2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP15]], [[TMP14]]
-// CHECK-TLS2-NEXT:    store i32 [[ADD6]], i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[TMP16:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x()
-// CHECK-TLS2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP16]], i64 0, i64 1
-// CHECK-TLS2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1
-// CHECK-TLS2-NEXT:    [[A8:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX7]], i32 0, i32 0
-// CHECK-TLS2-NEXT:    [[TMP17:%.*]] = load i32, i32* [[A8]], align 4
-// CHECK-TLS2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP18]], [[TMP17]]
-// CHECK-TLS2-NEXT:    store i32 [[ADD9]], i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[TMP19:%.*]] = call i32* @_ZTWN2STIiE2stE()
-// CHECK-TLS2-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
-// CHECK-TLS2-NEXT:    [[TMP21:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP21]], [[TMP20]]
+// CHECK-TLS2-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8
+// CHECK-TLS2-NEXT:    [[TMP13:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+// CHECK-TLS2-NEXT:    store i32 [[ADD5]], i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[TMP14:%.*]] = call %struct.S5* @_ZTW3gs3()
+// CHECK-TLS2-NEXT:    [[A6:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP14]], i32 0, i32 0
+// CHECK-TLS2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A6]], align 4
+// CHECK-TLS2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+// CHECK-TLS2-NEXT:    store i32 [[ADD7]], i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[TMP17:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x()
+// CHECK-TLS2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP17]], i64 0, i64 1
+// CHECK-TLS2-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1
+// CHECK-TLS2-NEXT:    [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX8]], i32 0, i32 0
+// CHECK-TLS2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A9]], align 4
+// CHECK-TLS2-NEXT:    [[TMP19:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP19]], [[TMP18]]
 // CHECK-TLS2-NEXT:    store i32 [[ADD10]], i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[TMP22:%.*]] = call float* @_ZTWN2STIfE2stE()
-// CHECK-TLS2-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP22]], align 4
-// CHECK-TLS2-NEXT:    [[CONV:%.*]] = fptosi float [[TMP23]] to i32
-// CHECK-TLS2-NEXT:    [[TMP24:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[CONV]]
+// CHECK-TLS2-NEXT:    [[TMP20:%.*]] = call i32* @_ZTWN2STIiE2stE()
+// CHECK-TLS2-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4
+// CHECK-TLS2-NEXT:    [[TMP22:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[TMP21]]
 // CHECK-TLS2-NEXT:    store i32 [[ADD11]], i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[TMP25:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE()
-// CHECK-TLS2-NEXT:    [[A12:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP25]], i32 0, i32 0
-// CHECK-TLS2-NEXT:    [[TMP26:%.*]] = load i32, i32* [[A12]], align 4
-// CHECK-TLS2-NEXT:    [[TMP27:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP27]], [[TMP26]]
-// CHECK-TLS2-NEXT:    store i32 [[ADD13]], i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[TMP23:%.*]] = call float* @_ZTWN2STIfE2stE()
+// CHECK-TLS2-NEXT:    [[TMP24:%.*]] = load float, float* [[TMP23]], align 4
+// CHECK-TLS2-NEXT:    [[CONV:%.*]] = fptosi float [[TMP24]] to i32
+// CHECK-TLS2-NEXT:    [[TMP25:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP25]], [[CONV]]
+// CHECK-TLS2-NEXT:    store i32 [[ADD12]], i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[TMP26:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE()
+// CHECK-TLS2-NEXT:    [[A13:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP26]], i32 0, i32 0
+// CHECK-TLS2-NEXT:    [[TMP27:%.*]] = load i32, i32* [[A13]], align 4
 // CHECK-TLS2-NEXT:    [[TMP28:%.*]] = load i32, i32* [[RES]], align 4
-// CHECK-TLS2-NEXT:    ret i32 [[TMP28]]
+// CHECK-TLS2-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP28]], [[TMP27]]
+// CHECK-TLS2-NEXT:    store i32 [[ADD14]], i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    [[TMP29:%.*]] = load i32, i32* [[RES]], align 4
+// CHECK-TLS2-NEXT:    ret i32 [[TMP29]]
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@_ZTWL3gs1
 // CHECK-TLS2-SAME: () #[[ATTR1:[0-9]+]] {
 // CHECK-TLS2-NEXT:    call void @_ZTHL3gs1()
-// CHECK-TLS2-NEXT:    ret %struct.S1* @_ZL3gs1
+// CHECK-TLS2-NEXT:    [[TMP1:%.*]] = call %struct.S1* @llvm.threadlocal.address.p0s_struct.S1s(%struct.S1* @_ZL3gs1)
+// CHECK-TLS2-NEXT:    ret %struct.S1* [[TMP1]]
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@_ZZ4mainEN5SmainC1Ei
@@ -4038,7 +4052,8 @@ int foobar() {
 // CHECK-TLS2-NEXT:    call void @_ZTHN6Static1sE()
 // CHECK-TLS2-NEXT:    br label [[TMP2]]
 // CHECK-TLS2:       2:
-// CHECK-TLS2-NEXT:    ret %struct.S3* @_ZN6Static1sE
+// CHECK-TLS2-NEXT:    [[TMP3:%.*]] = call %struct.S3* @llvm.threadlocal.address.p0s_struct.S3s(%struct.S3* @_ZN6Static1sE)
+// CHECK-TLS2-NEXT:    ret %struct.S3* [[TMP3]]
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@_ZTW3gs3
@@ -4048,33 +4063,38 @@ int foobar() {
 // CHECK-TLS2-NEXT:    call void @_ZTH3gs3()
 // CHECK-TLS2-NEXT:    br label [[TMP2]]
 // CHECK-TLS2:       2:
-// CHECK-TLS2-NEXT:    ret %struct.S5* @gs3
+// CHECK-TLS2-NEXT:    [[TMP3:%.*]] = call %struct.S5* @llvm.threadlocal.address.p0s_struct.S5s(%struct.S5* @gs3)
+// CHECK-TLS2-NEXT:    ret %struct.S5* [[TMP3]]
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@_ZTW5arr_x
 // CHECK-TLS2-SAME: () #[[ATTR1]] comdat {
 // CHECK-TLS2-NEXT:    call void @_ZTH5arr_x()
-// CHECK-TLS2-NEXT:    ret [2 x [3 x %struct.S1]]* @arr_x
+// CHECK-TLS2-NEXT:    [[TMP1:%.*]] = call [2 x [3 x %struct.S1]]* @llvm.threadlocal.address.p0a2a3s_struct.S1s([2 x [3 x %struct.S1]]* @arr_x)
+// CHECK-TLS2-NEXT:    ret [2 x [3 x %struct.S1]]* [[TMP1]]
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@_ZTWN2STIiE2stE
 // CHECK-TLS2-SAME: () #[[ATTR1]] comdat {
-// CHECK-TLS2-NEXT:    ret i32* @_ZN2STIiE2stE
+// CHECK-TLS2-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN2STIiE2stE)
+// CHECK-TLS2-NEXT:    ret i32* [[TMP1]]
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@_ZTWN2STIfE2stE
 // CHECK-TLS2-SAME: () #[[ATTR1]] comdat {
-// CHECK-TLS2-NEXT:    ret float* @_ZN2STIfE2stE
+// CHECK-TLS2-NEXT:    [[TMP1:%.*]] = call float* @llvm.threadlocal.address.p0f32(float* @_ZN2STIfE2stE)
+// CHECK-TLS2-NEXT:    ret float* [[TMP1]]
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@_ZTWN2STI2S4E2stE
 // CHECK-TLS2-SAME: () #[[ATTR1]] comdat {
 // CHECK-TLS2-NEXT:    call void @_ZTHN2STI2S4E2stE()
-// CHECK-TLS2-NEXT:    ret %struct.S4* @_ZN2STI2S4E2stE
+// CHECK-TLS2-NEXT:    [[TMP1:%.*]] = call %struct.S4* @llvm.threadlocal.address.p0s_struct.S4s(%struct.S4* @_ZN2STI2S4E2stE)
+// CHECK-TLS2-NEXT:    ret %struct.S4* [[TMP1]]
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@_Z6foobarv
-// CHECK-TLS2-SAME: () #[[ATTR5:[0-9]+]] {
+// CHECK-TLS2-SAME: () #[[ATTR6:[0-9]+]] {
 // CHECK-TLS2-NEXT:  entry:
 // CHECK-TLS2-NEXT:    [[RES:%.*]] = alloca i32, align 4
 // CHECK-TLS2-NEXT:    [[TMP0:%.*]] = call %struct.S3* @_ZTWN6Static1sE()
@@ -4127,7 +4147,7 @@ int foobar() {
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@__cxx_global_var_init
-// CHECK-TLS2-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK-TLS2-SAME: () #[[ATTR7:[0-9]+]] {
 // CHECK-TLS2-NEXT:  entry:
 // CHECK-TLS2-NEXT:    call void @_ZN2S1C1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(4) @_ZL3gs1, i32 noundef 5)
 // CHECK-TLS2-NEXT:    [[TMP0:%.*]] = call i32 @__cxa_thread_atexit(void (i8*)* bitcast (void (%struct.S1*)* @_ZN2S1D1Ev to void (i8*)*), i8* bitcast (%struct.S1* @_ZL3gs1 to i8*), i8* @__dso_handle) #[[ATTR4]]
@@ -4183,7 +4203,7 @@ int foobar() {
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
-// CHECK-TLS2-SAME: () #[[ATTR6]] {
+// CHECK-TLS2-SAME: () #[[ATTR7]] {
 // CHECK-TLS2-NEXT:  entry:
 // CHECK-TLS2-NEXT:    call void @_ZN2S2C1Ei(%struct.S2* noundef nonnull align 8 dereferenceable(16) @_ZL3gs2, i32 noundef 27)
 // CHECK-TLS2-NEXT:    [[TMP0:%.*]] = call i32 @__cxa_atexit(void (i8*)* bitcast (void (%struct.S2*)* @_ZN2S2D1Ev to void (i8*)*), i8* bitcast (%struct.S2* @_ZL3gs2 to i8*), i8* @__dso_handle) #[[ATTR4]]
@@ -4239,7 +4259,7 @@ int foobar() {
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
-// CHECK-TLS2-SAME: () #[[ATTR6]] personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+// CHECK-TLS2-SAME: () #[[ATTR7]] personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 // CHECK-TLS2-NEXT:  entry:
 // CHECK-TLS2-NEXT:    [[ARRAYINIT_ENDOFINIT:%.*]] = alloca [3 x %struct.S1]*, align 8
 // CHECK-TLS2-NEXT:    [[ARRAYINIT_ENDOFINIT1:%.*]] = alloca %struct.S1*, align 8
@@ -4332,7 +4352,7 @@ int foobar() {
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@__cxx_global_array_dtor
-// CHECK-TLS2-SAME: (i8* noundef [[TMP0:%.*]]) #[[ATTR6]] {
+// CHECK-TLS2-SAME: (i8* noundef [[TMP0:%.*]]) #[[ATTR7]] {
 // CHECK-TLS2-NEXT:  entry:
 // CHECK-TLS2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
 // CHECK-TLS2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
@@ -4373,7 +4393,7 @@ int foobar() {
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
-// CHECK-TLS2-SAME: () #[[ATTR6]] {
+// CHECK-TLS2-SAME: () #[[ATTR7]] {
 // CHECK-TLS2-NEXT:  entry:
 // CHECK-TLS2-NEXT:    [[TMP0:%.*]] = load i8, i8* bitcast (i64* @_ZGVN2STI2S4E2stE to i8*), align 8
 // CHECK-TLS2-NEXT:    [[GUARD_UNINITIALIZED:%.*]] = icmp eq i8 [[TMP0]], 0
@@ -4436,14 +4456,14 @@ int foobar() {
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_threadprivate_codegen.cpp
-// CHECK-TLS2-SAME: () #[[ATTR6]] {
+// CHECK-TLS2-SAME: () #[[ATTR7]] {
 // CHECK-TLS2-NEXT:  entry:
 // CHECK-TLS2-NEXT:    call void @__cxx_global_var_init.1()
 // CHECK-TLS2-NEXT:    ret void
 //
 //
 // CHECK-TLS2-LABEL: define {{[^@]+}}@__tls_init
-// CHECK-TLS2-SAME: () #[[ATTR6]] {
+// CHECK-TLS2-SAME: () #[[ATTR7]] {
 // CHECK-TLS2-NEXT:  entry:
 // CHECK-TLS2-NEXT:    [[TMP0:%.*]] = load i8, i8* @__tls_guard, align 1
 // CHECK-TLS2-NEXT:    [[GUARD_UNINITIALIZED:%.*]] = icmp eq i8 [[TMP0]], 0
@@ -4714,83 +4734,88 @@ int foobar() {
 // CHECK-TLS3-NEXT:    [[A1:%.*]] = getelementptr inbounds [[STRUCT_S3:%.*]], %struct.S3* [[TMP4]], i32 0, i32 0, !dbg [[DBG212:![0-9]+]]
 // CHECK-TLS3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A1]], align 4, !dbg [[DBG212]]
 // CHECK-TLS3-NEXT:    store i32 [[TMP5]], i32* [[RES]], align 4, !dbg [[DBG213:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_SMAIN:%.*]], %struct.Smain* @_ZZ4mainE2sm, i32 0, i32 0), align 8, !dbg [[DBG214:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG215:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP7]], [[TMP6]], !dbg [[DBG215]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD]], i32* [[RES]], align 4, !dbg [[DBG215]]
-// CHECK-TLS3-NEXT:    [[TMP8:%.*]] = call %struct.S1* @_ZTWL3gs1(), !dbg [[DBG216:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP8]], i32 0, i32 0, !dbg [[DBG217:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A2]], align 4, !dbg [[DBG217]]
-// CHECK-TLS3-NEXT:    [[TMP10:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG218:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], [[TMP9]], !dbg [[DBG218]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD3]], i32* [[RES]], align 4, !dbg [[DBG218]]
-// CHECK-TLS3-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8, !dbg [[DBG219:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP12:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG220:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP12]], [[TMP11]], !dbg [[DBG220]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD4]], i32* [[RES]], align 4, !dbg [[DBG220]]
-// CHECK-TLS3-NEXT:    [[TMP13:%.*]] = call %struct.S5* @_ZTW3gs3(), !dbg [[DBG221:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[A5:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP13]], i32 0, i32 0, !dbg [[DBG222:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP14:%.*]] = load i32, i32* [[A5]], align 4, !dbg [[DBG222]]
-// CHECK-TLS3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG223:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP15]], [[TMP14]], !dbg [[DBG223]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD6]], i32* [[RES]], align 4, !dbg [[DBG223]]
-// CHECK-TLS3-NEXT:    [[TMP16:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x(), !dbg [[DBG224:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP16]], i64 0, i64 1, !dbg [[DBG224]]
-// CHECK-TLS3-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG224]]
-// CHECK-TLS3-NEXT:    [[A8:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX7]], i32 0, i32 0, !dbg [[DBG225:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP17:%.*]] = load i32, i32* [[A8]], align 4, !dbg [[DBG225]]
-// CHECK-TLS3-NEXT:    [[TMP18:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG226:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP18]], [[TMP17]], !dbg [[DBG226]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD9]], i32* [[RES]], align 4, !dbg [[DBG226]]
-// CHECK-TLS3-NEXT:    [[TMP19:%.*]] = load i32, i32* @_ZN2STIiE2stE, align 4, !dbg [[DBG227:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP20:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG228:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP20]], [[TMP19]], !dbg [[DBG228]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD10]], i32* [[RES]], align 4, !dbg [[DBG228]]
-// CHECK-TLS3-NEXT:    [[TMP21:%.*]] = load float, float* @_ZN2STIfE2stE, align 4, !dbg [[DBG229:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[CONV:%.*]] = fptosi float [[TMP21]] to i32, !dbg [[DBG229]]
-// CHECK-TLS3-NEXT:    [[TMP22:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG230:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[CONV]], !dbg [[DBG230]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD11]], i32* [[RES]], align 4, !dbg [[DBG230]]
-// CHECK-TLS3-NEXT:    [[TMP23:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE(), !dbg [[DBG231:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[A12:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP23]], i32 0, i32 0, !dbg [[DBG232:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP24:%.*]] = load i32, i32* [[A12]], align 4, !dbg [[DBG232]]
-// CHECK-TLS3-NEXT:    [[TMP25:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG233:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP25]], [[TMP24]], !dbg [[DBG233]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD13]], i32* [[RES]], align 4, !dbg [[DBG233]]
-// CHECK-TLS3-NEXT:    [[TMP26:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG234:![0-9]+]]
-// CHECK-TLS3-NEXT:    ret i32 [[TMP26]], !dbg [[DBG235:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP6:%.*]] = call %struct.Smain* @llvm.threadlocal.address.p0s_struct.Smains(%struct.Smain* @_ZZ4mainE2sm), !dbg [[DBG214:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[TMP6]], i32 0, i32 0, !dbg [[DBG215:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A2]], align 8, !dbg [[DBG215]]
+// CHECK-TLS3-NEXT:    [[TMP8:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG216:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP7]], !dbg [[DBG216]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD]], i32* [[RES]], align 4, !dbg [[DBG216]]
+// CHECK-TLS3-NEXT:    [[TMP9:%.*]] = call %struct.S1* @_ZTWL3gs1(), !dbg [[DBG217:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP9]], i32 0, i32 0, !dbg [[DBG218:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A3]], align 4, !dbg [[DBG218]]
+// CHECK-TLS3-NEXT:    [[TMP11:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG219:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP10]], !dbg [[DBG219]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD4]], i32* [[RES]], align 4, !dbg [[DBG219]]
+// CHECK-TLS3-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8, !dbg [[DBG220:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP13:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG221:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP12]], !dbg [[DBG221]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD5]], i32* [[RES]], align 4, !dbg [[DBG221]]
+// CHECK-TLS3-NEXT:    [[TMP14:%.*]] = call %struct.S5* @_ZTW3gs3(), !dbg [[DBG222:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A6:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP14]], i32 0, i32 0, !dbg [[DBG223:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A6]], align 4, !dbg [[DBG223]]
+// CHECK-TLS3-NEXT:    [[TMP16:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG224:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP16]], [[TMP15]], !dbg [[DBG224]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD7]], i32* [[RES]], align 4, !dbg [[DBG224]]
+// CHECK-TLS3-NEXT:    [[TMP17:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x(), !dbg [[DBG225:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP17]], i64 0, i64 1, !dbg [[DBG225]]
+// CHECK-TLS3-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG225]]
+// CHECK-TLS3-NEXT:    [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX8]], i32 0, i32 0, !dbg [[DBG226:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A9]], align 4, !dbg [[DBG226]]
+// CHECK-TLS3-NEXT:    [[TMP19:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG227:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP19]], [[TMP18]], !dbg [[DBG227]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD10]], i32* [[RES]], align 4, !dbg [[DBG227]]
+// CHECK-TLS3-NEXT:    [[TMP20:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN2STIiE2stE), !dbg [[DBG228:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4, !dbg [[DBG228]]
+// CHECK-TLS3-NEXT:    [[TMP22:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG229:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[TMP21]], !dbg [[DBG229]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD11]], i32* [[RES]], align 4, !dbg [[DBG229]]
+// CHECK-TLS3-NEXT:    [[TMP23:%.*]] = call float* @llvm.threadlocal.address.p0f32(float* @_ZN2STIfE2stE), !dbg [[DBG230:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP24:%.*]] = load float, float* [[TMP23]], align 4, !dbg [[DBG230]]
+// CHECK-TLS3-NEXT:    [[CONV:%.*]] = fptosi float [[TMP24]] to i32, !dbg [[DBG230]]
+// CHECK-TLS3-NEXT:    [[TMP25:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG231:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP25]], [[CONV]], !dbg [[DBG231]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD12]], i32* [[RES]], align 4, !dbg [[DBG231]]
+// CHECK-TLS3-NEXT:    [[TMP26:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE(), !dbg [[DBG232:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A13:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP26]], i32 0, i32 0, !dbg [[DBG233:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP27:%.*]] = load i32, i32* [[A13]], align 4, !dbg [[DBG233]]
+// CHECK-TLS3-NEXT:    [[TMP28:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG234:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP28]], [[TMP27]], !dbg [[DBG234]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD14]], i32* [[RES]], align 4, !dbg [[DBG234]]
+// CHECK-TLS3-NEXT:    [[TMP29:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG235:![0-9]+]]
+// CHECK-TLS3-NEXT:    ret i32 [[TMP29]], !dbg [[DBG236:![0-9]+]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZTWL3gs1
 // CHECK-TLS3-SAME: () #[[ATTR6:[0-9]+]] {
 // CHECK-TLS3-NEXT:    call void @_ZTHL3gs1()
-// CHECK-TLS3-NEXT:    ret %struct.S1* @_ZL3gs1
+// CHECK-TLS3-NEXT:    [[TMP1:%.*]] = call %struct.S1* @llvm.threadlocal.address.p0s_struct.S1s(%struct.S1* @_ZL3gs1)
+// CHECK-TLS3-NEXT:    ret %struct.S1* [[TMP1]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZZ4mainEN5SmainC1Ei
-// CHECK-TLS3-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 !dbg [[DBG236:![0-9]+]] {
+// CHECK-TLS3-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 !dbg [[DBG237:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
 // CHECK-TLS3-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.Smain*, align 8
 // CHECK-TLS3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS3-NEXT:    store %struct.Smain* [[THIS]], %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META237:![0-9]+]], metadata !DIExpression()), !dbg [[DBG239:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META238:![0-9]+]], metadata !DIExpression()), !dbg [[DBG240:![0-9]+]]
 // CHECK-TLS3-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META240:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META241:![0-9]+]], metadata !DIExpression()), !dbg [[DBG242:![0-9]+]]
 // CHECK-TLS3-NEXT:    [[THIS1:%.*]] = load %struct.Smain*, %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG242:![0-9]+]]
-// CHECK-TLS3-NEXT:    call void @_ZZ4mainEN5SmainC2Ei(%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG242]]
-// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG243:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG243:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @_ZZ4mainEN5SmainC2Ei(%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG243]]
+// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG244:![0-9]+]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZZ4mainEN5SmainD1Ev
-// CHECK-TLS3-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] align 2 !dbg [[DBG244:![0-9]+]] {
+// CHECK-TLS3-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] align 2 !dbg [[DBG245:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
 // CHECK-TLS3-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.Smain*, align 8
 // CHECK-TLS3-NEXT:    store %struct.Smain* [[THIS]], %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META245:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META246:![0-9]+]], metadata !DIExpression()), !dbg [[DBG247:![0-9]+]]
 // CHECK-TLS3-NEXT:    [[THIS1:%.*]] = load %struct.Smain*, %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    call void @_ZZ4mainEN5SmainD2Ev(%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR3]], !dbg [[DBG247:![0-9]+]]
-// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG248:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @_ZZ4mainEN5SmainD2Ev(%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR3]], !dbg [[DBG248:![0-9]+]]
+// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG249:![0-9]+]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZTWN6Static1sE
@@ -4800,7 +4825,8 @@ int foobar() {
 // CHECK-TLS3-NEXT:    call void @_ZTHN6Static1sE()
 // CHECK-TLS3-NEXT:    br label [[TMP2]]
 // CHECK-TLS3:       2:
-// CHECK-TLS3-NEXT:    ret %struct.S3* @_ZN6Static1sE
+// CHECK-TLS3-NEXT:    [[TMP3:%.*]] = call %struct.S3* @llvm.threadlocal.address.p0s_struct.S3s(%struct.S3* @_ZN6Static1sE)
+// CHECK-TLS3-NEXT:    ret %struct.S3* [[TMP3]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZTW3gs3
@@ -4810,188 +4836,193 @@ int foobar() {
 // CHECK-TLS3-NEXT:    call void @_ZTH3gs3()
 // CHECK-TLS3-NEXT:    br label [[TMP2]]
 // CHECK-TLS3:       2:
-// CHECK-TLS3-NEXT:    ret %struct.S5* @gs3
+// CHECK-TLS3-NEXT:    [[TMP3:%.*]] = call %struct.S5* @llvm.threadlocal.address.p0s_struct.S5s(%struct.S5* @gs3)
+// CHECK-TLS3-NEXT:    ret %struct.S5* [[TMP3]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZTW5arr_x
 // CHECK-TLS3-SAME: () #[[ATTR6]] comdat {
 // CHECK-TLS3-NEXT:    call void @_ZTH5arr_x()
-// CHECK-TLS3-NEXT:    ret [2 x [3 x %struct.S1]]* @arr_x
+// CHECK-TLS3-NEXT:    [[TMP1:%.*]] = call [2 x [3 x %struct.S1]]* @llvm.threadlocal.address.p0a2a3s_struct.S1s([2 x [3 x %struct.S1]]* @arr_x)
+// CHECK-TLS3-NEXT:    ret [2 x [3 x %struct.S1]]* [[TMP1]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZTWN2STI2S4E2stE
 // CHECK-TLS3-SAME: () #[[ATTR6]] comdat {
 // CHECK-TLS3-NEXT:    call void @_ZTHN2STI2S4E2stE()
-// CHECK-TLS3-NEXT:    ret %struct.S4* @_ZN2STI2S4E2stE
+// CHECK-TLS3-NEXT:    [[TMP1:%.*]] = call %struct.S4* @llvm.threadlocal.address.p0s_struct.S4s(%struct.S4* @_ZN2STI2S4E2stE)
+// CHECK-TLS3-NEXT:    ret %struct.S4* [[TMP1]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZZ4mainEN5SmainC2Ei
-// CHECK-TLS3-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR2]] align 2 !dbg [[DBG249:![0-9]+]] {
+// CHECK-TLS3-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR2]] align 2 !dbg [[DBG250:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
 // CHECK-TLS3-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.Smain*, align 8
 // CHECK-TLS3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS3-NEXT:    store %struct.Smain* [[THIS]], %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META250:![0-9]+]], metadata !DIExpression()), !dbg [[DBG251:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META251:![0-9]+]], metadata !DIExpression()), !dbg [[DBG252:![0-9]+]]
 // CHECK-TLS3-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META252:![0-9]+]], metadata !DIExpression()), !dbg [[DBG253:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META253:![0-9]+]], metadata !DIExpression()), !dbg [[DBG254:![0-9]+]]
 // CHECK-TLS3-NEXT:    [[THIS1:%.*]] = load %struct.Smain*, %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[THIS1]], i32 0, i32 0, !dbg [[DBG254:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG255:![0-9]+]]
-// CHECK-TLS3-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 8, !dbg [[DBG254]]
-// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG256:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[THIS1]], i32 0, i32 0, !dbg [[DBG255:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG256:![0-9]+]]
+// CHECK-TLS3-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 8, !dbg [[DBG255]]
+// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG257:![0-9]+]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZZ4mainEN5SmainD2Ev
-// CHECK-TLS3-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] align 2 !dbg [[DBG257:![0-9]+]] {
+// CHECK-TLS3-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] align 2 !dbg [[DBG258:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
 // CHECK-TLS3-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.Smain*, align 8
 // CHECK-TLS3-NEXT:    store %struct.Smain* [[THIS]], %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META258:![0-9]+]], metadata !DIExpression()), !dbg [[DBG259:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META259:![0-9]+]], metadata !DIExpression()), !dbg [[DBG260:![0-9]+]]
 // CHECK-TLS3-NEXT:    [[THIS1:%.*]] = load %struct.Smain*, %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[THIS1]], i32 0, i32 0, !dbg [[DBG260:![0-9]+]]
-// CHECK-TLS3-NEXT:    store i32 0, i32* [[A]], align 8, !dbg [[DBG262:![0-9]+]]
-// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG263:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[THIS1]], i32 0, i32 0, !dbg [[DBG261:![0-9]+]]
+// CHECK-TLS3-NEXT:    store i32 0, i32* [[A]], align 8, !dbg [[DBG263:![0-9]+]]
+// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG264:![0-9]+]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_Z6foobarv
-// CHECK-TLS3-SAME: () #[[ATTR7:[0-9]+]] !dbg [[DBG264:![0-9]+]] {
+// CHECK-TLS3-SAME: () #[[ATTR7:[0-9]+]] !dbg [[DBG265:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
 // CHECK-TLS3-NEXT:    [[RES:%.*]] = alloca i32, align 4
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata i32* [[RES]], metadata [[META265:![0-9]+]], metadata !DIExpression()), !dbg [[DBG266:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = call %struct.S3* @_ZTWN6Static1sE(), !dbg [[DBG267:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S3:%.*]], %struct.S3* [[TMP0]], i32 0, i32 0, !dbg [[DBG268:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4, !dbg [[DBG268]]
-// CHECK-TLS3-NEXT:    store i32 [[TMP1]], i32* [[RES]], align 4, !dbg [[DBG269:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP2:%.*]] = call %struct.S1* @_ZTWL3gs1(), !dbg [[DBG270:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[A1:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP2]], i32 0, i32 0, !dbg [[DBG271:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A1]], align 4, !dbg [[DBG271]]
-// CHECK-TLS3-NEXT:    [[TMP4:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG272:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP4]], [[TMP3]], !dbg [[DBG272]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD]], i32* [[RES]], align 4, !dbg [[DBG272]]
-// CHECK-TLS3-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8, !dbg [[DBG273:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP6:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG274:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], [[TMP5]], !dbg [[DBG274]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD2]], i32* [[RES]], align 4, !dbg [[DBG274]]
-// CHECK-TLS3-NEXT:    [[TMP7:%.*]] = call %struct.S5* @_ZTW3gs3(), !dbg [[DBG275:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[A3:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP7]], i32 0, i32 0, !dbg [[DBG276:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP8:%.*]] = load i32, i32* [[A3]], align 4, !dbg [[DBG276]]
-// CHECK-TLS3-NEXT:    [[TMP9:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG277:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], [[TMP8]], !dbg [[DBG277]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD4]], i32* [[RES]], align 4, !dbg [[DBG277]]
-// CHECK-TLS3-NEXT:    [[TMP10:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x(), !dbg [[DBG278:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP10]], i64 0, i64 1, !dbg [[DBG278]]
-// CHECK-TLS3-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG278]]
-// CHECK-TLS3-NEXT:    [[A6:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX5]], i32 0, i32 0, !dbg [[DBG279:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP11:%.*]] = load i32, i32* [[A6]], align 4, !dbg [[DBG279]]
-// CHECK-TLS3-NEXT:    [[TMP12:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG280:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP12]], [[TMP11]], !dbg [[DBG280]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD7]], i32* [[RES]], align 4, !dbg [[DBG280]]
-// CHECK-TLS3-NEXT:    [[TMP13:%.*]] = load i32, i32* @_ZN2STIiE2stE, align 4, !dbg [[DBG281:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP14:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG282:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP14]], [[TMP13]], !dbg [[DBG282]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD8]], i32* [[RES]], align 4, !dbg [[DBG282]]
-// CHECK-TLS3-NEXT:    [[TMP15:%.*]] = load float, float* @_ZN2STIfE2stE, align 4, !dbg [[DBG283:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[CONV:%.*]] = fptosi float [[TMP15]] to i32, !dbg [[DBG283]]
-// CHECK-TLS3-NEXT:    [[TMP16:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG284:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP16]], [[CONV]], !dbg [[DBG284]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD9]], i32* [[RES]], align 4, !dbg [[DBG284]]
-// CHECK-TLS3-NEXT:    [[TMP17:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE(), !dbg [[DBG285:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[A10:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP17]], i32 0, i32 0, !dbg [[DBG286:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A10]], align 4, !dbg [[DBG286]]
-// CHECK-TLS3-NEXT:    [[TMP19:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG287:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP19]], [[TMP18]], !dbg [[DBG287]]
-// CHECK-TLS3-NEXT:    store i32 [[ADD11]], i32* [[RES]], align 4, !dbg [[DBG287]]
-// CHECK-TLS3-NEXT:    [[TMP20:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG288:![0-9]+]]
-// CHECK-TLS3-NEXT:    ret i32 [[TMP20]], !dbg [[DBG289:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata i32* [[RES]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG267:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = call %struct.S3* @_ZTWN6Static1sE(), !dbg [[DBG268:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S3:%.*]], %struct.S3* [[TMP0]], i32 0, i32 0, !dbg [[DBG269:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4, !dbg [[DBG269]]
+// CHECK-TLS3-NEXT:    store i32 [[TMP1]], i32* [[RES]], align 4, !dbg [[DBG270:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP2:%.*]] = call %struct.S1* @_ZTWL3gs1(), !dbg [[DBG271:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A1:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP2]], i32 0, i32 0, !dbg [[DBG272:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A1]], align 4, !dbg [[DBG272]]
+// CHECK-TLS3-NEXT:    [[TMP4:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG273:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP4]], [[TMP3]], !dbg [[DBG273]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD]], i32* [[RES]], align 4, !dbg [[DBG273]]
+// CHECK-TLS3-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8, !dbg [[DBG274:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP6:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG275:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], [[TMP5]], !dbg [[DBG275]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD2]], i32* [[RES]], align 4, !dbg [[DBG275]]
+// CHECK-TLS3-NEXT:    [[TMP7:%.*]] = call %struct.S5* @_ZTW3gs3(), !dbg [[DBG276:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A3:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP7]], i32 0, i32 0, !dbg [[DBG277:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP8:%.*]] = load i32, i32* [[A3]], align 4, !dbg [[DBG277]]
+// CHECK-TLS3-NEXT:    [[TMP9:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG278:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], [[TMP8]], !dbg [[DBG278]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD4]], i32* [[RES]], align 4, !dbg [[DBG278]]
+// CHECK-TLS3-NEXT:    [[TMP10:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x(), !dbg [[DBG279:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP10]], i64 0, i64 1, !dbg [[DBG279]]
+// CHECK-TLS3-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG279]]
+// CHECK-TLS3-NEXT:    [[A6:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX5]], i32 0, i32 0, !dbg [[DBG280:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP11:%.*]] = load i32, i32* [[A6]], align 4, !dbg [[DBG280]]
+// CHECK-TLS3-NEXT:    [[TMP12:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG281:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP12]], [[TMP11]], !dbg [[DBG281]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD7]], i32* [[RES]], align 4, !dbg [[DBG281]]
+// CHECK-TLS3-NEXT:    [[TMP13:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN2STIiE2stE), !dbg [[DBG282:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !dbg [[DBG282]]
+// CHECK-TLS3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG283:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP15]], [[TMP14]], !dbg [[DBG283]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD8]], i32* [[RES]], align 4, !dbg [[DBG283]]
+// CHECK-TLS3-NEXT:    [[TMP16:%.*]] = call float* @llvm.threadlocal.address.p0f32(float* @_ZN2STIfE2stE), !dbg [[DBG284:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !dbg [[DBG284]]
+// CHECK-TLS3-NEXT:    [[CONV:%.*]] = fptosi float [[TMP17]] to i32, !dbg [[DBG284]]
+// CHECK-TLS3-NEXT:    [[TMP18:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG285:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP18]], [[CONV]], !dbg [[DBG285]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD9]], i32* [[RES]], align 4, !dbg [[DBG285]]
+// CHECK-TLS3-NEXT:    [[TMP19:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE(), !dbg [[DBG286:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A10:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP19]], i32 0, i32 0, !dbg [[DBG287:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP20:%.*]] = load i32, i32* [[A10]], align 4, !dbg [[DBG287]]
+// CHECK-TLS3-NEXT:    [[TMP21:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG288:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP21]], [[TMP20]], !dbg [[DBG288]]
+// CHECK-TLS3-NEXT:    store i32 [[ADD11]], i32* [[RES]], align 4, !dbg [[DBG288]]
+// CHECK-TLS3-NEXT:    [[TMP22:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG289:![0-9]+]]
+// CHECK-TLS3-NEXT:    ret i32 [[TMP22]], !dbg [[DBG290:![0-9]+]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
-// CHECK-TLS3-SAME: () #[[ATTR0]] !dbg [[DBG290:![0-9]+]] {
+// CHECK-TLS3-SAME: () #[[ATTR0]] !dbg [[DBG291:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
-// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i8, i8* bitcast (i64* @_ZGVN2STI2S4E2stE to i8*), align 8, !dbg [[DBG291:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[GUARD_UNINITIALIZED:%.*]] = icmp eq i8 [[TMP0]], 0, !dbg [[DBG291]]
-// CHECK-TLS3-NEXT:    br i1 [[GUARD_UNINITIALIZED]], label [[INIT_CHECK:%.*]], label [[INIT_END:%.*]], !dbg [[DBG291]]
+// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i8, i8* bitcast (i64* @_ZGVN2STI2S4E2stE to i8*), align 8, !dbg [[DBG292:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[GUARD_UNINITIALIZED:%.*]] = icmp eq i8 [[TMP0]], 0, !dbg [[DBG292]]
+// CHECK-TLS3-NEXT:    br i1 [[GUARD_UNINITIALIZED]], label [[INIT_CHECK:%.*]], label [[INIT_END:%.*]], !dbg [[DBG292]]
 // CHECK-TLS3:       init.check:
-// CHECK-TLS3-NEXT:    call void @_ZN2S4C1Ei(%struct.S4* noundef nonnull align 4 dereferenceable(8) @_ZN2STI2S4E2stE, i32 noundef 23), !dbg [[DBG292:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP1:%.*]] = call i32 @__cxa_thread_atexit(void (i8*)* bitcast (void (%struct.S4*)* @_ZN2S4D1Ev to void (i8*)*), i8* bitcast (%struct.S4* @_ZN2STI2S4E2stE to i8*), i8* @__dso_handle) #[[ATTR3]], !dbg [[DBG291]]
-// CHECK-TLS3-NEXT:    store i8 1, i8* bitcast (i64* @_ZGVN2STI2S4E2stE to i8*), align 8, !dbg [[DBG291]]
-// CHECK-TLS3-NEXT:    br label [[INIT_END]], !dbg [[DBG291]]
+// CHECK-TLS3-NEXT:    call void @_ZN2S4C1Ei(%struct.S4* noundef nonnull align 4 dereferenceable(8) @_ZN2STI2S4E2stE, i32 noundef 23), !dbg [[DBG293:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP1:%.*]] = call i32 @__cxa_thread_atexit(void (i8*)* bitcast (void (%struct.S4*)* @_ZN2S4D1Ev to void (i8*)*), i8* bitcast (%struct.S4* @_ZN2STI2S4E2stE to i8*), i8* @__dso_handle) #[[ATTR3]], !dbg [[DBG292]]
+// CHECK-TLS3-NEXT:    store i8 1, i8* bitcast (i64* @_ZGVN2STI2S4E2stE to i8*), align 8, !dbg [[DBG292]]
+// CHECK-TLS3-NEXT:    br label [[INIT_END]], !dbg [[DBG292]]
 // CHECK-TLS3:       init.end:
-// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG294:![0-9]+]]
+// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG295:![0-9]+]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZN2S4C1Ei
-// CHECK-TLS3-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 !dbg [[DBG295:![0-9]+]] {
+// CHECK-TLS3-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 !dbg [[DBG296:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
 // CHECK-TLS3-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S4*, align 8
 // CHECK-TLS3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS3-NEXT:    store %struct.S4* [[THIS]], %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META296:![0-9]+]], metadata !DIExpression()), !dbg [[DBG298:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META297:![0-9]+]], metadata !DIExpression()), !dbg [[DBG299:![0-9]+]]
 // CHECK-TLS3-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META299:![0-9]+]], metadata !DIExpression()), !dbg [[DBG300:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META300:![0-9]+]], metadata !DIExpression()), !dbg [[DBG301:![0-9]+]]
 // CHECK-TLS3-NEXT:    [[THIS1:%.*]] = load %struct.S4*, %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG301:![0-9]+]]
-// CHECK-TLS3-NEXT:    call void @_ZN2S4C2Ei(%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG301]]
-// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG302:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG302:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @_ZN2S4C2Ei(%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG302]]
+// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG303:![0-9]+]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZN2S4D1Ev
-// CHECK-TLS3-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat align 2 !dbg [[DBG303:![0-9]+]] {
+// CHECK-TLS3-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat align 2 !dbg [[DBG304:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
 // CHECK-TLS3-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S4*, align 8
 // CHECK-TLS3-NEXT:    store %struct.S4* [[THIS]], %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META304:![0-9]+]], metadata !DIExpression()), !dbg [[DBG305:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META305:![0-9]+]], metadata !DIExpression()), !dbg [[DBG306:![0-9]+]]
 // CHECK-TLS3-NEXT:    [[THIS1:%.*]] = load %struct.S4*, %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    call void @_ZN2S4D2Ev(%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS1]]) #[[ATTR3]], !dbg [[DBG306:![0-9]+]]
-// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG307:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @_ZN2S4D2Ev(%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS1]]) #[[ATTR3]], !dbg [[DBG307:![0-9]+]]
+// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG308:![0-9]+]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZN2S4C2Ei
-// CHECK-TLS3-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR2]] comdat align 2 !dbg [[DBG308:![0-9]+]] {
+// CHECK-TLS3-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR2]] comdat align 2 !dbg [[DBG309:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
 // CHECK-TLS3-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S4*, align 8
 // CHECK-TLS3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS3-NEXT:    store %struct.S4* [[THIS]], %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META309:![0-9]+]], metadata !DIExpression()), !dbg [[DBG310:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META310:![0-9]+]], metadata !DIExpression()), !dbg [[DBG311:![0-9]+]]
 // CHECK-TLS3-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META311:![0-9]+]], metadata !DIExpression()), !dbg [[DBG312:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META312:![0-9]+]], metadata !DIExpression()), !dbg [[DBG313:![0-9]+]]
 // CHECK-TLS3-NEXT:    [[THIS1:%.*]] = load %struct.S4*, %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[THIS1]], i32 0, i32 0, !dbg [[DBG313:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG314:![0-9]+]]
-// CHECK-TLS3-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 4, !dbg [[DBG313]]
-// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG315:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[THIS1]], i32 0, i32 0, !dbg [[DBG314:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG315:![0-9]+]]
+// CHECK-TLS3-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 4, !dbg [[DBG314]]
+// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG316:![0-9]+]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_ZN2S4D2Ev
-// CHECK-TLS3-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat align 2 !dbg [[DBG316:![0-9]+]] {
+// CHECK-TLS3-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat align 2 !dbg [[DBG317:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
 // CHECK-TLS3-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S4*, align 8
 // CHECK-TLS3-NEXT:    store %struct.S4* [[THIS]], %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META317:![0-9]+]], metadata !DIExpression()), !dbg [[DBG318:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META318:![0-9]+]], metadata !DIExpression()), !dbg [[DBG319:![0-9]+]]
 // CHECK-TLS3-NEXT:    [[THIS1:%.*]] = load %struct.S4*, %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[THIS1]], i32 0, i32 0, !dbg [[DBG319:![0-9]+]]
-// CHECK-TLS3-NEXT:    store i32 0, i32* [[A]], align 4, !dbg [[DBG321:![0-9]+]]
-// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG322:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[THIS1]], i32 0, i32 0, !dbg [[DBG320:![0-9]+]]
+// CHECK-TLS3-NEXT:    store i32 0, i32* [[A]], align 4, !dbg [[DBG322:![0-9]+]]
+// CHECK-TLS3-NEXT:    ret void, !dbg [[DBG323:![0-9]+]]
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_threadprivate_codegen.cpp
-// CHECK-TLS3-SAME: () #[[ATTR0]] !dbg [[DBG323:![0-9]+]] {
+// CHECK-TLS3-SAME: () #[[ATTR0]] !dbg [[DBG324:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
-// CHECK-TLS3-NEXT:    call void @__cxx_global_var_init.1(), !dbg [[DBG325:![0-9]+]]
+// CHECK-TLS3-NEXT:    call void @__cxx_global_var_init.1(), !dbg [[DBG326:![0-9]+]]
 // CHECK-TLS3-NEXT:    ret void
 //
 //
 // CHECK-TLS3-LABEL: define {{[^@]+}}@__tls_init
-// CHECK-TLS3-SAME: () #[[ATTR0]] !dbg [[DBG326:![0-9]+]] {
+// CHECK-TLS3-SAME: () #[[ATTR0]] !dbg [[DBG327:![0-9]+]] {
 // CHECK-TLS3-NEXT:  entry:
-// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i8, i8* @__tls_guard, align 1, !dbg [[DBG327:![0-9]+]]
-// CHECK-TLS3-NEXT:    [[GUARD_UNINITIALIZED:%.*]] = icmp eq i8 [[TMP0]], 0, !dbg [[DBG327]]
-// CHECK-TLS3-NEXT:    br i1 [[GUARD_UNINITIALIZED]], label [[INIT:%.*]], label [[EXIT:%.*]], !dbg [[DBG327]], !prof [[PROF207]]
+// CHECK-TLS3-NEXT:    [[TMP0:%.*]] = load i8, i8* @__tls_guard, align 1, !dbg [[DBG328:![0-9]+]]
+// CHECK-TLS3-NEXT:    [[GUARD_UNINITIALIZED:%.*]] = icmp eq i8 [[TMP0]], 0, !dbg [[DBG328]]
+// CHECK-TLS3-NEXT:    br i1 [[GUARD_UNINITIALIZED]], label [[INIT:%.*]], label [[EXIT:%.*]], !dbg [[DBG328]], !prof [[PROF207]]
 // CHECK-TLS3:       init:
-// CHECK-TLS3-NEXT:    store i8 1, i8* @__tls_guard, align 1, !dbg [[DBG327]]
-// CHECK-TLS3-NEXT:    call void @__cxx_global_var_init(), !dbg [[DBG327]]
-// CHECK-TLS3-NEXT:    call void @__cxx_global_var_init.2(), !dbg [[DBG327]]
-// CHECK-TLS3-NEXT:    br label [[EXIT]], !dbg [[DBG327]]
+// CHECK-TLS3-NEXT:    store i8 1, i8* @__tls_guard, align 1, !dbg [[DBG328]]
+// CHECK-TLS3-NEXT:    call void @__cxx_global_var_init(), !dbg [[DBG328]]
+// CHECK-TLS3-NEXT:    call void @__cxx_global_var_init.2(), !dbg [[DBG328]]
+// CHECK-TLS3-NEXT:    br label [[EXIT]], !dbg [[DBG328]]
 // CHECK-TLS3:       exit:
 // CHECK-TLS3-NEXT:    ret void
 //
@@ -5019,85 +5050,88 @@ int foobar() {
 // CHECK-TLS4-NEXT:    [[A1:%.*]] = getelementptr inbounds [[STRUCT_S3:%.*]], %struct.S3* [[TMP4]], i32 0, i32 0, !dbg [[DBG124:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A1]], align 4, !dbg [[DBG124]]
 // CHECK-TLS4-NEXT:    store i32 [[TMP5]], i32* [[RES]], align 4, !dbg [[DBG125:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_SMAIN:%.*]], %struct.Smain* @_ZZ4mainE2sm, i32 0, i32 0), align 8, !dbg [[DBG126:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG127:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP7]], [[TMP6]], !dbg [[DBG127]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD]], i32* [[RES]], align 4, !dbg [[DBG127]]
-// CHECK-TLS4-NEXT:    [[TMP8:%.*]] = call %struct.S1* @_ZTWL3gs1(), !dbg [[DBG128:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP8]], i32 0, i32 0, !dbg [[DBG129:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A2]], align 4, !dbg [[DBG129]]
-// CHECK-TLS4-NEXT:    [[TMP10:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG130:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], [[TMP9]], !dbg [[DBG130]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD3]], i32* [[RES]], align 4, !dbg [[DBG130]]
-// CHECK-TLS4-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8, !dbg [[DBG131:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP12:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG132:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP12]], [[TMP11]], !dbg [[DBG132]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD4]], i32* [[RES]], align 4, !dbg [[DBG132]]
-// CHECK-TLS4-NEXT:    [[TMP13:%.*]] = call %struct.S5* @_ZTW3gs3(), !dbg [[DBG133:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[A5:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP13]], i32 0, i32 0, !dbg [[DBG134:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP14:%.*]] = load i32, i32* [[A5]], align 4, !dbg [[DBG134]]
-// CHECK-TLS4-NEXT:    [[TMP15:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG135:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP15]], [[TMP14]], !dbg [[DBG135]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD6]], i32* [[RES]], align 4, !dbg [[DBG135]]
-// CHECK-TLS4-NEXT:    [[TMP16:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x(), !dbg [[DBG136:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP16]], i64 0, i64 1, !dbg [[DBG136]]
-// CHECK-TLS4-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG136]]
-// CHECK-TLS4-NEXT:    [[A8:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX7]], i32 0, i32 0, !dbg [[DBG137:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP17:%.*]] = load i32, i32* [[A8]], align 4, !dbg [[DBG137]]
-// CHECK-TLS4-NEXT:    [[TMP18:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG138:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP18]], [[TMP17]], !dbg [[DBG138]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD9]], i32* [[RES]], align 4, !dbg [[DBG138]]
-// CHECK-TLS4-NEXT:    [[TMP19:%.*]] = call i32* @_ZTWN2STIiE2stE(), !dbg [[DBG139:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !dbg [[DBG139]]
-// CHECK-TLS4-NEXT:    [[TMP21:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG140:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP21]], [[TMP20]], !dbg [[DBG140]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD10]], i32* [[RES]], align 4, !dbg [[DBG140]]
-// CHECK-TLS4-NEXT:    [[TMP22:%.*]] = call float* @_ZTWN2STIfE2stE(), !dbg [[DBG141:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !dbg [[DBG141]]
-// CHECK-TLS4-NEXT:    [[CONV:%.*]] = fptosi float [[TMP23]] to i32, !dbg [[DBG141]]
-// CHECK-TLS4-NEXT:    [[TMP24:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG142:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[CONV]], !dbg [[DBG142]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD11]], i32* [[RES]], align 4, !dbg [[DBG142]]
-// CHECK-TLS4-NEXT:    [[TMP25:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE(), !dbg [[DBG143:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[A12:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP25]], i32 0, i32 0, !dbg [[DBG144:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP26:%.*]] = load i32, i32* [[A12]], align 4, !dbg [[DBG144]]
-// CHECK-TLS4-NEXT:    [[TMP27:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG145:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP27]], [[TMP26]], !dbg [[DBG145]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD13]], i32* [[RES]], align 4, !dbg [[DBG145]]
+// CHECK-TLS4-NEXT:    [[TMP6:%.*]] = call %struct.Smain* @llvm.threadlocal.address.p0s_struct.Smains(%struct.Smain* @_ZZ4mainE2sm), !dbg [[DBG126:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[TMP6]], i32 0, i32 0, !dbg [[DBG127:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A2]], align 8, !dbg [[DBG127]]
+// CHECK-TLS4-NEXT:    [[TMP8:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG128:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP7]], !dbg [[DBG128]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD]], i32* [[RES]], align 4, !dbg [[DBG128]]
+// CHECK-TLS4-NEXT:    [[TMP9:%.*]] = call %struct.S1* @_ZTWL3gs1(), !dbg [[DBG129:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP9]], i32 0, i32 0, !dbg [[DBG130:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A3]], align 4, !dbg [[DBG130]]
+// CHECK-TLS4-NEXT:    [[TMP11:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG131:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP10]], !dbg [[DBG131]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD4]], i32* [[RES]], align 4, !dbg [[DBG131]]
+// CHECK-TLS4-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8, !dbg [[DBG132:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP13:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG133:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP12]], !dbg [[DBG133]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD5]], i32* [[RES]], align 4, !dbg [[DBG133]]
+// CHECK-TLS4-NEXT:    [[TMP14:%.*]] = call %struct.S5* @_ZTW3gs3(), !dbg [[DBG134:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A6:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP14]], i32 0, i32 0, !dbg [[DBG135:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A6]], align 4, !dbg [[DBG135]]
+// CHECK-TLS4-NEXT:    [[TMP16:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG136:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP16]], [[TMP15]], !dbg [[DBG136]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD7]], i32* [[RES]], align 4, !dbg [[DBG136]]
+// CHECK-TLS4-NEXT:    [[TMP17:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x(), !dbg [[DBG137:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP17]], i64 0, i64 1, !dbg [[DBG137]]
+// CHECK-TLS4-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG137]]
+// CHECK-TLS4-NEXT:    [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX8]], i32 0, i32 0, !dbg [[DBG138:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A9]], align 4, !dbg [[DBG138]]
+// CHECK-TLS4-NEXT:    [[TMP19:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG139:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP19]], [[TMP18]], !dbg [[DBG139]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD10]], i32* [[RES]], align 4, !dbg [[DBG139]]
+// CHECK-TLS4-NEXT:    [[TMP20:%.*]] = call i32* @_ZTWN2STIiE2stE(), !dbg [[DBG140:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4, !dbg [[DBG140]]
+// CHECK-TLS4-NEXT:    [[TMP22:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG141:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[TMP21]], !dbg [[DBG141]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD11]], i32* [[RES]], align 4, !dbg [[DBG141]]
+// CHECK-TLS4-NEXT:    [[TMP23:%.*]] = call float* @_ZTWN2STIfE2stE(), !dbg [[DBG142:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP24:%.*]] = load float, float* [[TMP23]], align 4, !dbg [[DBG142]]
+// CHECK-TLS4-NEXT:    [[CONV:%.*]] = fptosi float [[TMP24]] to i32, !dbg [[DBG142]]
+// CHECK-TLS4-NEXT:    [[TMP25:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG143:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP25]], [[CONV]], !dbg [[DBG143]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD12]], i32* [[RES]], align 4, !dbg [[DBG143]]
+// CHECK-TLS4-NEXT:    [[TMP26:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE(), !dbg [[DBG144:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A13:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP26]], i32 0, i32 0, !dbg [[DBG145:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP27:%.*]] = load i32, i32* [[A13]], align 4, !dbg [[DBG145]]
 // CHECK-TLS4-NEXT:    [[TMP28:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG146:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret i32 [[TMP28]], !dbg [[DBG147:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP28]], [[TMP27]], !dbg [[DBG146]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD14]], i32* [[RES]], align 4, !dbg [[DBG146]]
+// CHECK-TLS4-NEXT:    [[TMP29:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG147:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret i32 [[TMP29]], !dbg [[DBG148:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZTWL3gs1
 // CHECK-TLS4-SAME: () #[[ATTR2:[0-9]+]] {
 // CHECK-TLS4-NEXT:    call void @_ZTHL3gs1()
-// CHECK-TLS4-NEXT:    ret %struct.S1* @_ZL3gs1
+// CHECK-TLS4-NEXT:    [[TMP1:%.*]] = call %struct.S1* @llvm.threadlocal.address.p0s_struct.S1s(%struct.S1* @_ZL3gs1)
+// CHECK-TLS4-NEXT:    ret %struct.S1* [[TMP1]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZZ4mainEN5SmainC1Ei
-// CHECK-TLS4-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR3:[0-9]+]] align 2 !dbg [[DBG148:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR3:[0-9]+]] align 2 !dbg [[DBG149:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.Smain*, align 8
 // CHECK-TLS4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS4-NEXT:    store %struct.Smain* [[THIS]], %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META149:![0-9]+]], metadata !DIExpression()), !dbg [[DBG151:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META150:![0-9]+]], metadata !DIExpression()), !dbg [[DBG152:![0-9]+]]
 // CHECK-TLS4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META152:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META153:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.Smain*, %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG154:![0-9]+]]
-// CHECK-TLS4-NEXT:    call void @_ZZ4mainEN5SmainC2Ei(%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG154]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG155:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG155:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @_ZZ4mainEN5SmainC2Ei(%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG155]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG156:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZZ4mainEN5SmainD1Ev
-// CHECK-TLS4-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4:[0-9]+]] align 2 !dbg [[DBG156:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4:[0-9]+]] align 2 !dbg [[DBG157:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.Smain*, align 8
 // CHECK-TLS4-NEXT:    store %struct.Smain* [[THIS]], %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META157:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META158:![0-9]+]], metadata !DIExpression()), !dbg [[DBG159:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.Smain*, %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @_ZZ4mainEN5SmainD2Ev(%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR5]], !dbg [[DBG159:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG160:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @_ZZ4mainEN5SmainD2Ev(%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR5]], !dbg [[DBG160:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG161:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZTWN6Static1sE
@@ -5107,7 +5141,8 @@ int foobar() {
 // CHECK-TLS4-NEXT:    call void @_ZTHN6Static1sE()
 // CHECK-TLS4-NEXT:    br label [[TMP2]]
 // CHECK-TLS4:       2:
-// CHECK-TLS4-NEXT:    ret %struct.S3* @_ZN6Static1sE
+// CHECK-TLS4-NEXT:    [[TMP3:%.*]] = call %struct.S3* @llvm.threadlocal.address.p0s_struct.S3s(%struct.S3* @_ZN6Static1sE)
+// CHECK-TLS4-NEXT:    ret %struct.S3* [[TMP3]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZTW3gs3
@@ -5117,434 +5152,439 @@ int foobar() {
 // CHECK-TLS4-NEXT:    call void @_ZTH3gs3()
 // CHECK-TLS4-NEXT:    br label [[TMP2]]
 // CHECK-TLS4:       2:
-// CHECK-TLS4-NEXT:    ret %struct.S5* @gs3
+// CHECK-TLS4-NEXT:    [[TMP3:%.*]] = call %struct.S5* @llvm.threadlocal.address.p0s_struct.S5s(%struct.S5* @gs3)
+// CHECK-TLS4-NEXT:    ret %struct.S5* [[TMP3]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZTW5arr_x
 // CHECK-TLS4-SAME: () #[[ATTR2]] comdat {
 // CHECK-TLS4-NEXT:    call void @_ZTH5arr_x()
-// CHECK-TLS4-NEXT:    ret [2 x [3 x %struct.S1]]* @arr_x
+// CHECK-TLS4-NEXT:    [[TMP1:%.*]] = call [2 x [3 x %struct.S1]]* @llvm.threadlocal.address.p0a2a3s_struct.S1s([2 x [3 x %struct.S1]]* @arr_x)
+// CHECK-TLS4-NEXT:    ret [2 x [3 x %struct.S1]]* [[TMP1]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZTWN2STIiE2stE
 // CHECK-TLS4-SAME: () #[[ATTR2]] comdat {
-// CHECK-TLS4-NEXT:    ret i32* @_ZN2STIiE2stE
+// CHECK-TLS4-NEXT:    [[TMP1:%.*]] = call i32* @llvm.threadlocal.address.p0i32(i32* @_ZN2STIiE2stE)
+// CHECK-TLS4-NEXT:    ret i32* [[TMP1]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZTWN2STIfE2stE
 // CHECK-TLS4-SAME: () #[[ATTR2]] comdat {
-// CHECK-TLS4-NEXT:    ret float* @_ZN2STIfE2stE
+// CHECK-TLS4-NEXT:    [[TMP1:%.*]] = call float* @llvm.threadlocal.address.p0f32(float* @_ZN2STIfE2stE)
+// CHECK-TLS4-NEXT:    ret float* [[TMP1]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZTWN2STI2S4E2stE
 // CHECK-TLS4-SAME: () #[[ATTR2]] comdat {
 // CHECK-TLS4-NEXT:    call void @_ZTHN2STI2S4E2stE()
-// CHECK-TLS4-NEXT:    ret %struct.S4* @_ZN2STI2S4E2stE
+// CHECK-TLS4-NEXT:    [[TMP1:%.*]] = call %struct.S4* @llvm.threadlocal.address.p0s_struct.S4s(%struct.S4* @_ZN2STI2S4E2stE)
+// CHECK-TLS4-NEXT:    ret %struct.S4* [[TMP1]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_Z6foobarv
-// CHECK-TLS4-SAME: () #[[ATTR6:[0-9]+]] !dbg [[DBG161:![0-9]+]] {
+// CHECK-TLS4-SAME: () #[[ATTR6:[0-9]+]] !dbg [[DBG162:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[RES:%.*]] = alloca i32, align 4
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[RES]], metadata [[META162:![0-9]+]], metadata !DIExpression()), !dbg [[DBG163:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = call %struct.S3* @_ZTWN6Static1sE(), !dbg [[DBG164:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S3:%.*]], %struct.S3* [[TMP0]], i32 0, i32 0, !dbg [[DBG165:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4, !dbg [[DBG165]]
-// CHECK-TLS4-NEXT:    store i32 [[TMP1]], i32* [[RES]], align 4, !dbg [[DBG166:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP2:%.*]] = call %struct.S1* @_ZTWL3gs1(), !dbg [[DBG167:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[A1:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP2]], i32 0, i32 0, !dbg [[DBG168:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A1]], align 4, !dbg [[DBG168]]
-// CHECK-TLS4-NEXT:    [[TMP4:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG169:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP4]], [[TMP3]], !dbg [[DBG169]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD]], i32* [[RES]], align 4, !dbg [[DBG169]]
-// CHECK-TLS4-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8, !dbg [[DBG170:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP6:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG171:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], [[TMP5]], !dbg [[DBG171]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD2]], i32* [[RES]], align 4, !dbg [[DBG171]]
-// CHECK-TLS4-NEXT:    [[TMP7:%.*]] = call %struct.S5* @_ZTW3gs3(), !dbg [[DBG172:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[A3:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP7]], i32 0, i32 0, !dbg [[DBG173:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP8:%.*]] = load i32, i32* [[A3]], align 4, !dbg [[DBG173]]
-// CHECK-TLS4-NEXT:    [[TMP9:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG174:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], [[TMP8]], !dbg [[DBG174]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD4]], i32* [[RES]], align 4, !dbg [[DBG174]]
-// CHECK-TLS4-NEXT:    [[TMP10:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x(), !dbg [[DBG175:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP10]], i64 0, i64 1, !dbg [[DBG175]]
-// CHECK-TLS4-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG175]]
-// CHECK-TLS4-NEXT:    [[A6:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX5]], i32 0, i32 0, !dbg [[DBG176:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP11:%.*]] = load i32, i32* [[A6]], align 4, !dbg [[DBG176]]
-// CHECK-TLS4-NEXT:    [[TMP12:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG177:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP12]], [[TMP11]], !dbg [[DBG177]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD7]], i32* [[RES]], align 4, !dbg [[DBG177]]
-// CHECK-TLS4-NEXT:    [[TMP13:%.*]] = call i32* @_ZTWN2STIiE2stE(), !dbg [[DBG178:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !dbg [[DBG178]]
-// CHECK-TLS4-NEXT:    [[TMP15:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG179:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP15]], [[TMP14]], !dbg [[DBG179]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD8]], i32* [[RES]], align 4, !dbg [[DBG179]]
-// CHECK-TLS4-NEXT:    [[TMP16:%.*]] = call float* @_ZTWN2STIfE2stE(), !dbg [[DBG180:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !dbg [[DBG180]]
-// CHECK-TLS4-NEXT:    [[CONV:%.*]] = fptosi float [[TMP17]] to i32, !dbg [[DBG180]]
-// CHECK-TLS4-NEXT:    [[TMP18:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG181:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP18]], [[CONV]], !dbg [[DBG181]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD9]], i32* [[RES]], align 4, !dbg [[DBG181]]
-// CHECK-TLS4-NEXT:    [[TMP19:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE(), !dbg [[DBG182:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[A10:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP19]], i32 0, i32 0, !dbg [[DBG183:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP20:%.*]] = load i32, i32* [[A10]], align 4, !dbg [[DBG183]]
-// CHECK-TLS4-NEXT:    [[TMP21:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG184:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP21]], [[TMP20]], !dbg [[DBG184]]
-// CHECK-TLS4-NEXT:    store i32 [[ADD11]], i32* [[RES]], align 4, !dbg [[DBG184]]
-// CHECK-TLS4-NEXT:    [[TMP22:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG185:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret i32 [[TMP22]], !dbg [[DBG186:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[RES]], metadata [[META163:![0-9]+]], metadata !DIExpression()), !dbg [[DBG164:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = call %struct.S3* @_ZTWN6Static1sE(), !dbg [[DBG165:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S3:%.*]], %struct.S3* [[TMP0]], i32 0, i32 0, !dbg [[DBG166:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4, !dbg [[DBG166]]
+// CHECK-TLS4-NEXT:    store i32 [[TMP1]], i32* [[RES]], align 4, !dbg [[DBG167:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP2:%.*]] = call %struct.S1* @_ZTWL3gs1(), !dbg [[DBG168:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A1:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP2]], i32 0, i32 0, !dbg [[DBG169:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A1]], align 4, !dbg [[DBG169]]
+// CHECK-TLS4-NEXT:    [[TMP4:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG170:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP4]], [[TMP3]], !dbg [[DBG170]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD]], i32* [[RES]], align 4, !dbg [[DBG170]]
+// CHECK-TLS4-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_S2:%.*]], %struct.S2* @_ZL3gs2, i32 0, i32 0), align 8, !dbg [[DBG171:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP6:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG172:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], [[TMP5]], !dbg [[DBG172]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD2]], i32* [[RES]], align 4, !dbg [[DBG172]]
+// CHECK-TLS4-NEXT:    [[TMP7:%.*]] = call %struct.S5* @_ZTW3gs3(), !dbg [[DBG173:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A3:%.*]] = getelementptr inbounds [[STRUCT_S5:%.*]], %struct.S5* [[TMP7]], i32 0, i32 0, !dbg [[DBG174:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP8:%.*]] = load i32, i32* [[A3]], align 4, !dbg [[DBG174]]
+// CHECK-TLS4-NEXT:    [[TMP9:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG175:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], [[TMP8]], !dbg [[DBG175]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD4]], i32* [[RES]], align 4, !dbg [[DBG175]]
+// CHECK-TLS4-NEXT:    [[TMP10:%.*]] = call [2 x [3 x %struct.S1]]* @_ZTW5arr_x(), !dbg [[DBG176:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* [[TMP10]], i64 0, i64 1, !dbg [[DBG176]]
+// CHECK-TLS4-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG176]]
+// CHECK-TLS4-NEXT:    [[A6:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYIDX5]], i32 0, i32 0, !dbg [[DBG177:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP11:%.*]] = load i32, i32* [[A6]], align 4, !dbg [[DBG177]]
+// CHECK-TLS4-NEXT:    [[TMP12:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG178:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP12]], [[TMP11]], !dbg [[DBG178]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD7]], i32* [[RES]], align 4, !dbg [[DBG178]]
+// CHECK-TLS4-NEXT:    [[TMP13:%.*]] = call i32* @_ZTWN2STIiE2stE(), !dbg [[DBG179:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !dbg [[DBG179]]
+// CHECK-TLS4-NEXT:    [[TMP15:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG180:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP15]], [[TMP14]], !dbg [[DBG180]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD8]], i32* [[RES]], align 4, !dbg [[DBG180]]
+// CHECK-TLS4-NEXT:    [[TMP16:%.*]] = call float* @_ZTWN2STIfE2stE(), !dbg [[DBG181:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !dbg [[DBG181]]
+// CHECK-TLS4-NEXT:    [[CONV:%.*]] = fptosi float [[TMP17]] to i32, !dbg [[DBG181]]
+// CHECK-TLS4-NEXT:    [[TMP18:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG182:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP18]], [[CONV]], !dbg [[DBG182]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD9]], i32* [[RES]], align 4, !dbg [[DBG182]]
+// CHECK-TLS4-NEXT:    [[TMP19:%.*]] = call %struct.S4* @_ZTWN2STI2S4E2stE(), !dbg [[DBG183:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A10:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[TMP19]], i32 0, i32 0, !dbg [[DBG184:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP20:%.*]] = load i32, i32* [[A10]], align 4, !dbg [[DBG184]]
+// CHECK-TLS4-NEXT:    [[TMP21:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG185:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP21]], [[TMP20]], !dbg [[DBG185]]
+// CHECK-TLS4-NEXT:    store i32 [[ADD11]], i32* [[RES]], align 4, !dbg [[DBG185]]
+// CHECK-TLS4-NEXT:    [[TMP22:%.*]] = load i32, i32* [[RES]], align 4, !dbg [[DBG186:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret i32 [[TMP22]], !dbg [[DBG187:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@__cxx_global_var_init
-// CHECK-TLS4-SAME: () #[[ATTR7:[0-9]+]] !dbg [[DBG187:![0-9]+]] {
+// CHECK-TLS4-SAME: () #[[ATTR7:[0-9]+]] !dbg [[DBG188:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
-// CHECK-TLS4-NEXT:    call void @_ZN2S1C1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(4) @_ZL3gs1, i32 noundef 5), !dbg [[DBG191:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = call i32 @__cxa_thread_atexit(void (i8*)* bitcast (void (%struct.S1*)* @_ZN2S1D1Ev to void (i8*)*), i8* bitcast (%struct.S1* @_ZL3gs1 to i8*), i8* @__dso_handle) #[[ATTR5]], !dbg [[DBG193:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG194:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S1C1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(4) @_ZL3gs1, i32 noundef 5), !dbg [[DBG192:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = call i32 @__cxa_thread_atexit(void (i8*)* bitcast (void (%struct.S1*)* @_ZN2S1D1Ev to void (i8*)*), i8* bitcast (%struct.S1* @_ZL3gs1 to i8*), i8* @__dso_handle) #[[ATTR5]], !dbg [[DBG194:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG195:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S1C1Ei
-// CHECK-TLS4-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 !dbg [[DBG195:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 !dbg [[DBG196:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8
 // CHECK-TLS4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS4-NEXT:    store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S1** [[THIS_ADDR]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG198:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S1** [[THIS_ADDR]], metadata [[META197:![0-9]+]], metadata !DIExpression()), !dbg [[DBG199:![0-9]+]]
 // CHECK-TLS4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META199:![0-9]+]], metadata !DIExpression()), !dbg [[DBG200:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META200:![0-9]+]], metadata !DIExpression()), !dbg [[DBG201:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG201:![0-9]+]]
-// CHECK-TLS4-NEXT:    call void @_ZN2S1C2Ei(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG201]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG202:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG202:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S1C2Ei(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG202]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG203:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S1D1Ev
-// CHECK-TLS4-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG203:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG204:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8
 // CHECK-TLS4-NEXT:    store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S1** [[THIS_ADDR]], metadata [[META204:![0-9]+]], metadata !DIExpression()), !dbg [[DBG205:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S1** [[THIS_ADDR]], metadata [[META205:![0-9]+]], metadata !DIExpression()), !dbg [[DBG206:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @_ZN2S1D2Ev(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]], !dbg [[DBG206:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG207:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S1D2Ev(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]], !dbg [[DBG207:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG208:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S1C2Ei
-// CHECK-TLS4-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG208:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG209:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8
 // CHECK-TLS4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS4-NEXT:    store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S1** [[THIS_ADDR]], metadata [[META209:![0-9]+]], metadata !DIExpression()), !dbg [[DBG210:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S1** [[THIS_ADDR]], metadata [[META210:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211:![0-9]+]]
 // CHECK-TLS4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META211:![0-9]+]], metadata !DIExpression()), !dbg [[DBG212:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META212:![0-9]+]], metadata !DIExpression()), !dbg [[DBG213:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0, !dbg [[DBG213:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG214:![0-9]+]]
-// CHECK-TLS4-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 4, !dbg [[DBG213]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG215:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0, !dbg [[DBG214:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG215:![0-9]+]]
+// CHECK-TLS4-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 4, !dbg [[DBG214]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG216:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S1D2Ev
-// CHECK-TLS4-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG216:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG217:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8
 // CHECK-TLS4-NEXT:    store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S1** [[THIS_ADDR]], metadata [[META217:![0-9]+]], metadata !DIExpression()), !dbg [[DBG218:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S1** [[THIS_ADDR]], metadata [[META218:![0-9]+]], metadata !DIExpression()), !dbg [[DBG219:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0, !dbg [[DBG219:![0-9]+]]
-// CHECK-TLS4-NEXT:    store i32 0, i32* [[A]], align 4, !dbg [[DBG221:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG222:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0, !dbg [[DBG220:![0-9]+]]
+// CHECK-TLS4-NEXT:    store i32 0, i32* [[A]], align 4, !dbg [[DBG222:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG223:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
-// CHECK-TLS4-SAME: () #[[ATTR7]] !dbg [[DBG223:![0-9]+]] {
+// CHECK-TLS4-SAME: () #[[ATTR7]] !dbg [[DBG224:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
-// CHECK-TLS4-NEXT:    call void @_ZN2S2C1Ei(%struct.S2* noundef nonnull align 8 dereferenceable(16) @_ZL3gs2, i32 noundef 27), !dbg [[DBG224:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = call i32 @__cxa_atexit(void (i8*)* bitcast (void (%struct.S2*)* @_ZN2S2D1Ev to void (i8*)*), i8* bitcast (%struct.S2* @_ZL3gs2 to i8*), i8* @__dso_handle) #[[ATTR5]], !dbg [[DBG226:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG227:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S2C1Ei(%struct.S2* noundef nonnull align 8 dereferenceable(16) @_ZL3gs2, i32 noundef 27), !dbg [[DBG225:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = call i32 @__cxa_atexit(void (i8*)* bitcast (void (%struct.S2*)* @_ZN2S2D1Ev to void (i8*)*), i8* bitcast (%struct.S2* @_ZL3gs2 to i8*), i8* @__dso_handle) #[[ATTR5]], !dbg [[DBG227:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG228:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S2C1Ei
-// CHECK-TLS4-SAME: (%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 !dbg [[DBG228:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 !dbg [[DBG229:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S2*, align 8
 // CHECK-TLS4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS4-NEXT:    store %struct.S2* [[THIS]], %struct.S2** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S2** [[THIS_ADDR]], metadata [[META229:![0-9]+]], metadata !DIExpression()), !dbg [[DBG231:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S2** [[THIS_ADDR]], metadata [[META230:![0-9]+]], metadata !DIExpression()), !dbg [[DBG232:![0-9]+]]
 // CHECK-TLS4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META232:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META233:![0-9]+]], metadata !DIExpression()), !dbg [[DBG234:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S2*, %struct.S2** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG234:![0-9]+]]
-// CHECK-TLS4-NEXT:    call void @_ZN2S2C2Ei(%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG234]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG235:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG235:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S2C2Ei(%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG235]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG236:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S2D1Ev
-// CHECK-TLS4-SAME: (%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG236:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG237:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S2*, align 8
 // CHECK-TLS4-NEXT:    store %struct.S2* [[THIS]], %struct.S2** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S2** [[THIS_ADDR]], metadata [[META237:![0-9]+]], metadata !DIExpression()), !dbg [[DBG238:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S2** [[THIS_ADDR]], metadata [[META238:![0-9]+]], metadata !DIExpression()), !dbg [[DBG239:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S2*, %struct.S2** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @_ZN2S2D2Ev(%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS1]]) #[[ATTR5]], !dbg [[DBG239:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG240:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S2D2Ev(%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS1]]) #[[ATTR5]], !dbg [[DBG240:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG241:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S2C2Ei
-// CHECK-TLS4-SAME: (%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG241:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG242:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S2*, align 8
 // CHECK-TLS4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS4-NEXT:    store %struct.S2* [[THIS]], %struct.S2** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S2** [[THIS_ADDR]], metadata [[META242:![0-9]+]], metadata !DIExpression()), !dbg [[DBG243:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S2** [[THIS_ADDR]], metadata [[META243:![0-9]+]], metadata !DIExpression()), !dbg [[DBG244:![0-9]+]]
 // CHECK-TLS4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META244:![0-9]+]], metadata !DIExpression()), !dbg [[DBG245:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META245:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S2*, %struct.S2** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], %struct.S2* [[THIS1]], i32 0, i32 0, !dbg [[DBG246:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG247:![0-9]+]]
-// CHECK-TLS4-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 8, !dbg [[DBG246]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG248:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], %struct.S2* [[THIS1]], i32 0, i32 0, !dbg [[DBG247:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG248:![0-9]+]]
+// CHECK-TLS4-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 8, !dbg [[DBG247]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG249:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S2D2Ev
-// CHECK-TLS4-SAME: (%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG249:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S2* noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG250:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S2*, align 8
 // CHECK-TLS4-NEXT:    store %struct.S2* [[THIS]], %struct.S2** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S2** [[THIS_ADDR]], metadata [[META250:![0-9]+]], metadata !DIExpression()), !dbg [[DBG251:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S2** [[THIS_ADDR]], metadata [[META251:![0-9]+]], metadata !DIExpression()), !dbg [[DBG252:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S2*, %struct.S2** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], %struct.S2* [[THIS1]], i32 0, i32 0, !dbg [[DBG252:![0-9]+]]
-// CHECK-TLS4-NEXT:    store i32 0, i32* [[A]], align 8, !dbg [[DBG254:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG255:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], %struct.S2* [[THIS1]], i32 0, i32 0, !dbg [[DBG253:![0-9]+]]
+// CHECK-TLS4-NEXT:    store i32 0, i32* [[A]], align 8, !dbg [[DBG255:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG256:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
-// CHECK-TLS4-SAME: () #[[ATTR7]] personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg [[DBG256:![0-9]+]] {
+// CHECK-TLS4-SAME: () #[[ATTR7]] personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg [[DBG257:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[ARRAYINIT_ENDOFINIT:%.*]] = alloca [3 x %struct.S1]*, align 8
 // CHECK-TLS4-NEXT:    [[ARRAYINIT_ENDOFINIT1:%.*]] = alloca %struct.S1*, align 8
 // CHECK-TLS4-NEXT:    [[EXN_SLOT:%.*]] = alloca i8*, align 8
 // CHECK-TLS4-NEXT:    [[EHSELECTOR_SLOT:%.*]] = alloca i32, align 4
 // CHECK-TLS4-NEXT:    [[ARRAYINIT_ENDOFINIT5:%.*]] = alloca %struct.S1*, align 8
-// CHECK-TLS4-NEXT:    store [3 x %struct.S1]* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0), [3 x %struct.S1]** [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG257:![0-9]+]]
-// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 0), %struct.S1** [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG259:![0-9]+]]
+// CHECK-TLS4-NEXT:    store [3 x %struct.S1]* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0), [3 x %struct.S1]** [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG258:![0-9]+]]
+// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 0), %struct.S1** [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG260:![0-9]+]]
 // CHECK-TLS4-NEXT:    invoke void @_ZN2S1C1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 0), i32 noundef 1)
-// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]], !dbg [[DBG260:![0-9]+]]
+// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]], !dbg [[DBG261:![0-9]+]]
 // CHECK-TLS4:       invoke.cont:
-// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 1), %struct.S1** [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG259]]
+// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 1), %struct.S1** [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG260]]
 // CHECK-TLS4-NEXT:    invoke void @_ZN2S1C1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 1), i32 noundef 2)
-// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT2:%.*]] unwind label [[LPAD]], !dbg [[DBG261:![0-9]+]]
+// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT2:%.*]] unwind label [[LPAD]], !dbg [[DBG262:![0-9]+]]
 // CHECK-TLS4:       invoke.cont2:
-// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 2), %struct.S1** [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG259]]
+// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 2), %struct.S1** [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG260]]
 // CHECK-TLS4-NEXT:    invoke void @_ZN2S1C1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 2), i32 noundef 3)
-// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT3:%.*]] unwind label [[LPAD]], !dbg [[DBG262:![0-9]+]]
+// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT3:%.*]] unwind label [[LPAD]], !dbg [[DBG263:![0-9]+]]
 // CHECK-TLS4:       invoke.cont3:
-// CHECK-TLS4-NEXT:    store [3 x %struct.S1]* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1), [3 x %struct.S1]** [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 0), %struct.S1** [[ARRAYINIT_ENDOFINIT5]], align 8, !dbg [[DBG263:![0-9]+]]
+// CHECK-TLS4-NEXT:    store [3 x %struct.S1]* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1), [3 x %struct.S1]** [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 0), %struct.S1** [[ARRAYINIT_ENDOFINIT5]], align 8, !dbg [[DBG264:![0-9]+]]
 // CHECK-TLS4-NEXT:    invoke void @_ZN2S1C1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 0), i32 noundef 4)
-// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD6:%.*]], !dbg [[DBG264:![0-9]+]]
+// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD6:%.*]], !dbg [[DBG265:![0-9]+]]
 // CHECK-TLS4:       invoke.cont7:
-// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 1), %struct.S1** [[ARRAYINIT_ENDOFINIT5]], align 8, !dbg [[DBG263]]
+// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 1), %struct.S1** [[ARRAYINIT_ENDOFINIT5]], align 8, !dbg [[DBG264]]
 // CHECK-TLS4-NEXT:    invoke void @_ZN2S1C1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 1), i32 noundef 5)
-// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT8:%.*]] unwind label [[LPAD6]], !dbg [[DBG265:![0-9]+]]
+// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT8:%.*]] unwind label [[LPAD6]], !dbg [[DBG266:![0-9]+]]
 // CHECK-TLS4:       invoke.cont8:
-// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 2), %struct.S1** [[ARRAYINIT_ENDOFINIT5]], align 8, !dbg [[DBG263]]
+// CHECK-TLS4-NEXT:    store %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 2), %struct.S1** [[ARRAYINIT_ENDOFINIT5]], align 8, !dbg [[DBG264]]
 // CHECK-TLS4-NEXT:    invoke void @_ZN2S1C1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 2), i32 noundef 6)
-// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT9:%.*]] unwind label [[LPAD6]], !dbg [[DBG266:![0-9]+]]
+// CHECK-TLS4-NEXT:    to label [[INVOKE_CONT9:%.*]] unwind label [[LPAD6]], !dbg [[DBG267:![0-9]+]]
 // CHECK-TLS4:       invoke.cont9:
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = call i32 @__cxa_thread_atexit(void (i8*)* @__cxx_global_array_dtor, i8* null, i8* @__dso_handle) #[[ATTR5]], !dbg [[DBG267:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG267]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = call i32 @__cxa_thread_atexit(void (i8*)* @__cxx_global_array_dtor, i8* null, i8* @__dso_handle) #[[ATTR5]], !dbg [[DBG268:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG268]]
 // CHECK-TLS4:       lpad:
 // CHECK-TLS4-NEXT:    [[TMP1:%.*]] = landingpad { i8*, i32 }
-// CHECK-TLS4-NEXT:    cleanup, !dbg [[DBG268:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP2:%.*]] = extractvalue { i8*, i32 } [[TMP1]], 0, !dbg [[DBG268]]
-// CHECK-TLS4-NEXT:    store i8* [[TMP2]], i8** [[EXN_SLOT]], align 8, !dbg [[DBG268]]
-// CHECK-TLS4-NEXT:    [[TMP3:%.*]] = extractvalue { i8*, i32 } [[TMP1]], 1, !dbg [[DBG268]]
-// CHECK-TLS4-NEXT:    store i32 [[TMP3]], i32* [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG268]]
-// CHECK-TLS4-NEXT:    [[TMP4:%.*]] = load %struct.S1*, %struct.S1** [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG259]]
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ISEMPTY:%.*]] = icmp eq %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 0), [[TMP4]], !dbg [[DBG259]]
-// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_ISEMPTY]], label [[ARRAYDESTROY_DONE4:%.*]], label [[ARRAYDESTROY_BODY:%.*]], !dbg [[DBG259]]
+// CHECK-TLS4-NEXT:    cleanup, !dbg [[DBG269:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP2:%.*]] = extractvalue { i8*, i32 } [[TMP1]], 0, !dbg [[DBG269]]
+// CHECK-TLS4-NEXT:    store i8* [[TMP2]], i8** [[EXN_SLOT]], align 8, !dbg [[DBG269]]
+// CHECK-TLS4-NEXT:    [[TMP3:%.*]] = extractvalue { i8*, i32 } [[TMP1]], 1, !dbg [[DBG269]]
+// CHECK-TLS4-NEXT:    store i32 [[TMP3]], i32* [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG269]]
+// CHECK-TLS4-NEXT:    [[TMP4:%.*]] = load %struct.S1*, %struct.S1** [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG260]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ISEMPTY:%.*]] = icmp eq %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 0), [[TMP4]], !dbg [[DBG260]]
+// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_ISEMPTY]], label [[ARRAYDESTROY_DONE4:%.*]], label [[ARRAYDESTROY_BODY:%.*]], !dbg [[DBG260]]
 // CHECK-TLS4:       arraydestroy.body:
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S1* [ [[TMP4]], [[LPAD]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ], !dbg [[DBG259]]
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1, !dbg [[DBG259]]
-// CHECK-TLS4-NEXT:    call void @_ZN2S1D1Ev(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]], !dbg [[DBG259]]
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S1* [[ARRAYDESTROY_ELEMENT]], getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 0), !dbg [[DBG259]]
-// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE4]], label [[ARRAYDESTROY_BODY]], !dbg [[DBG259]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S1* [ [[TMP4]], [[LPAD]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ], !dbg [[DBG260]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1, !dbg [[DBG260]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S1D1Ev(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]], !dbg [[DBG260]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S1* [[ARRAYDESTROY_ELEMENT]], getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 0), !dbg [[DBG260]]
+// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE4]], label [[ARRAYDESTROY_BODY]], !dbg [[DBG260]]
 // CHECK-TLS4:       arraydestroy.done4:
-// CHECK-TLS4-NEXT:    br label [[EHCLEANUP:%.*]], !dbg [[DBG259]]
+// CHECK-TLS4-NEXT:    br label [[EHCLEANUP:%.*]], !dbg [[DBG260]]
 // CHECK-TLS4:       lpad6:
 // CHECK-TLS4-NEXT:    [[TMP5:%.*]] = landingpad { i8*, i32 }
-// CHECK-TLS4-NEXT:    cleanup, !dbg [[DBG268]]
-// CHECK-TLS4-NEXT:    [[TMP6:%.*]] = extractvalue { i8*, i32 } [[TMP5]], 0, !dbg [[DBG268]]
-// CHECK-TLS4-NEXT:    store i8* [[TMP6]], i8** [[EXN_SLOT]], align 8, !dbg [[DBG268]]
-// CHECK-TLS4-NEXT:    [[TMP7:%.*]] = extractvalue { i8*, i32 } [[TMP5]], 1, !dbg [[DBG268]]
-// CHECK-TLS4-NEXT:    store i32 [[TMP7]], i32* [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG268]]
-// CHECK-TLS4-NEXT:    [[TMP8:%.*]] = load %struct.S1*, %struct.S1** [[ARRAYINIT_ENDOFINIT5]], align 8, !dbg [[DBG263]]
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ISEMPTY10:%.*]] = icmp eq %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 0), [[TMP8]], !dbg [[DBG263]]
-// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_ISEMPTY10]], label [[ARRAYDESTROY_DONE15:%.*]], label [[ARRAYDESTROY_BODY11:%.*]], !dbg [[DBG263]]
+// CHECK-TLS4-NEXT:    cleanup, !dbg [[DBG269]]
+// CHECK-TLS4-NEXT:    [[TMP6:%.*]] = extractvalue { i8*, i32 } [[TMP5]], 0, !dbg [[DBG269]]
+// CHECK-TLS4-NEXT:    store i8* [[TMP6]], i8** [[EXN_SLOT]], align 8, !dbg [[DBG269]]
+// CHECK-TLS4-NEXT:    [[TMP7:%.*]] = extractvalue { i8*, i32 } [[TMP5]], 1, !dbg [[DBG269]]
+// CHECK-TLS4-NEXT:    store i32 [[TMP7]], i32* [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG269]]
+// CHECK-TLS4-NEXT:    [[TMP8:%.*]] = load %struct.S1*, %struct.S1** [[ARRAYINIT_ENDOFINIT5]], align 8, !dbg [[DBG264]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ISEMPTY10:%.*]] = icmp eq %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 0), [[TMP8]], !dbg [[DBG264]]
+// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_ISEMPTY10]], label [[ARRAYDESTROY_DONE15:%.*]], label [[ARRAYDESTROY_BODY11:%.*]], !dbg [[DBG264]]
 // CHECK-TLS4:       arraydestroy.body11:
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENTPAST12:%.*]] = phi %struct.S1* [ [[TMP8]], [[LPAD6]] ], [ [[ARRAYDESTROY_ELEMENT13:%.*]], [[ARRAYDESTROY_BODY11]] ], !dbg [[DBG263]]
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENT13]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYDESTROY_ELEMENTPAST12]], i64 -1, !dbg [[DBG263]]
-// CHECK-TLS4-NEXT:    call void @_ZN2S1D1Ev(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT13]]) #[[ATTR5]], !dbg [[DBG263]]
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_DONE14:%.*]] = icmp eq %struct.S1* [[ARRAYDESTROY_ELEMENT13]], getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 0), !dbg [[DBG263]]
-// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_DONE14]], label [[ARRAYDESTROY_DONE15]], label [[ARRAYDESTROY_BODY11]], !dbg [[DBG263]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENTPAST12:%.*]] = phi %struct.S1* [ [[TMP8]], [[LPAD6]] ], [ [[ARRAYDESTROY_ELEMENT13:%.*]], [[ARRAYDESTROY_BODY11]] ], !dbg [[DBG264]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENT13]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYDESTROY_ELEMENTPAST12]], i64 -1, !dbg [[DBG264]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S1D1Ev(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT13]]) #[[ATTR5]], !dbg [[DBG264]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_DONE14:%.*]] = icmp eq %struct.S1* [[ARRAYDESTROY_ELEMENT13]], getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 1, i64 0), !dbg [[DBG264]]
+// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_DONE14]], label [[ARRAYDESTROY_DONE15]], label [[ARRAYDESTROY_BODY11]], !dbg [[DBG264]]
 // CHECK-TLS4:       arraydestroy.done15:
-// CHECK-TLS4-NEXT:    br label [[EHCLEANUP]], !dbg [[DBG263]]
+// CHECK-TLS4-NEXT:    br label [[EHCLEANUP]], !dbg [[DBG264]]
 // CHECK-TLS4:       ehcleanup:
-// CHECK-TLS4-NEXT:    [[TMP9:%.*]] = load [3 x %struct.S1]*, [3 x %struct.S1]** [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    [[PAD_ARRAYEND:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[TMP9]], i64 0, i64 0, !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ISEMPTY16:%.*]] = icmp eq %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 0), [[PAD_ARRAYEND]], !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_ISEMPTY16]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17:%.*]], !dbg [[DBG257]]
+// CHECK-TLS4-NEXT:    [[TMP9:%.*]] = load [3 x %struct.S1]*, [3 x %struct.S1]** [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    [[PAD_ARRAYEND:%.*]] = getelementptr inbounds [3 x %struct.S1], [3 x %struct.S1]* [[TMP9]], i64 0, i64 0, !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ISEMPTY16:%.*]] = icmp eq %struct.S1* getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 0), [[PAD_ARRAYEND]], !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_ISEMPTY16]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17:%.*]], !dbg [[DBG258]]
 // CHECK-TLS4:       arraydestroy.body17:
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi %struct.S1* [ [[PAD_ARRAYEND]], [[EHCLEANUP]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ], !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYDESTROY_ELEMENTPAST18]], i64 -1, !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    call void @_ZN2S1D1Ev(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR5]], !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_DONE20:%.*]] = icmp eq %struct.S1* [[ARRAYDESTROY_ELEMENT19]], getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 0), !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21]], label [[ARRAYDESTROY_BODY17]], !dbg [[DBG257]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi %struct.S1* [ [[PAD_ARRAYEND]], [[EHCLEANUP]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ], !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[ARRAYDESTROY_ELEMENTPAST18]], i64 -1, !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S1D1Ev(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR5]], !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_DONE20:%.*]] = icmp eq %struct.S1* [[ARRAYDESTROY_ELEMENT19]], getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 0, i64 0, i64 0), !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21]], label [[ARRAYDESTROY_BODY17]], !dbg [[DBG258]]
 // CHECK-TLS4:       arraydestroy.done21:
-// CHECK-TLS4-NEXT:    br label [[EH_RESUME:%.*]], !dbg [[DBG257]]
+// CHECK-TLS4-NEXT:    br label [[EH_RESUME:%.*]], !dbg [[DBG258]]
 // CHECK-TLS4:       eh.resume:
-// CHECK-TLS4-NEXT:    [[EXN:%.*]] = load i8*, i8** [[EXN_SLOT]], align 8, !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    [[SEL:%.*]] = load i32, i32* [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    [[LPAD_VAL:%.*]] = insertvalue { i8*, i32 } undef, i8* [[EXN]], 0, !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    [[LPAD_VAL22:%.*]] = insertvalue { i8*, i32 } [[LPAD_VAL]], i32 [[SEL]], 1, !dbg [[DBG257]]
-// CHECK-TLS4-NEXT:    resume { i8*, i32 } [[LPAD_VAL22]], !dbg [[DBG257]]
+// CHECK-TLS4-NEXT:    [[EXN:%.*]] = load i8*, i8** [[EXN_SLOT]], align 8, !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    [[SEL:%.*]] = load i32, i32* [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    [[LPAD_VAL:%.*]] = insertvalue { i8*, i32 } undef, i8* [[EXN]], 0, !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    [[LPAD_VAL22:%.*]] = insertvalue { i8*, i32 } [[LPAD_VAL]], i32 [[SEL]], 1, !dbg [[DBG258]]
+// CHECK-TLS4-NEXT:    resume { i8*, i32 } [[LPAD_VAL22]], !dbg [[DBG258]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@__cxx_global_array_dtor
-// CHECK-TLS4-SAME: (i8* noundef [[TMP0:%.*]]) #[[ATTR7]] !dbg [[DBG269:![0-9]+]] {
+// CHECK-TLS4-SAME: (i8* noundef [[TMP0:%.*]]) #[[ATTR7]] !dbg [[DBG270:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
 // CHECK-TLS4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i8** [[DOTADDR]], metadata [[META273:![0-9]+]], metadata !DIExpression()), !dbg [[DBG274:![0-9]+]]
-// CHECK-TLS4-NEXT:    br label [[ARRAYDESTROY_BODY:%.*]], !dbg [[DBG274]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i8** [[DOTADDR]], metadata [[META274:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275:![0-9]+]]
+// CHECK-TLS4-NEXT:    br label [[ARRAYDESTROY_BODY:%.*]], !dbg [[DBG275]]
 // CHECK-TLS4:       arraydestroy.body:
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S1* [ getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 1, i64 0, i64 0), [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ], !dbg [[DBG274]]
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1, !dbg [[DBG274]]
-// CHECK-TLS4-NEXT:    call void @_ZN2S1D1Ev(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]], !dbg [[DBG274]]
-// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S1* [[ARRAYDESTROY_ELEMENT]], getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i32 0, i32 0, i32 0), !dbg [[DBG274]]
-// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]], !dbg [[DBG274]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S1* [ getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i64 1, i64 0, i64 0), [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ], !dbg [[DBG275]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1, !dbg [[DBG275]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S1D1Ev(%struct.S1* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]], !dbg [[DBG275]]
+// CHECK-TLS4-NEXT:    [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S1* [[ARRAYDESTROY_ELEMENT]], getelementptr inbounds ([2 x [3 x %struct.S1]], [2 x [3 x %struct.S1]]* @arr_x, i32 0, i32 0, i32 0), !dbg [[DBG275]]
+// CHECK-TLS4-NEXT:    br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]], !dbg [[DBG275]]
 // CHECK-TLS4:       arraydestroy.done1:
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG274]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG275]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZZ4mainEN5SmainC2Ei
-// CHECK-TLS4-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR4]] align 2 !dbg [[DBG275:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR4]] align 2 !dbg [[DBG276:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.Smain*, align 8
 // CHECK-TLS4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS4-NEXT:    store %struct.Smain* [[THIS]], %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG277:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META277:![0-9]+]], metadata !DIExpression()), !dbg [[DBG278:![0-9]+]]
 // CHECK-TLS4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META278:![0-9]+]], metadata !DIExpression()), !dbg [[DBG279:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META279:![0-9]+]], metadata !DIExpression()), !dbg [[DBG280:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.Smain*, %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[THIS1]], i32 0, i32 0, !dbg [[DBG280:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG281:![0-9]+]]
-// CHECK-TLS4-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 8, !dbg [[DBG280]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG282:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[THIS1]], i32 0, i32 0, !dbg [[DBG281:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG282:![0-9]+]]
+// CHECK-TLS4-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 8, !dbg [[DBG281]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG283:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZZ4mainEN5SmainD2Ev
-// CHECK-TLS4-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] align 2 !dbg [[DBG283:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.Smain* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] align 2 !dbg [[DBG284:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.Smain*, align 8
 // CHECK-TLS4-NEXT:    store %struct.Smain* [[THIS]], %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META284:![0-9]+]], metadata !DIExpression()), !dbg [[DBG285:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.Smain** [[THIS_ADDR]], metadata [[META285:![0-9]+]], metadata !DIExpression()), !dbg [[DBG286:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.Smain*, %struct.Smain** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[THIS1]], i32 0, i32 0, !dbg [[DBG286:![0-9]+]]
-// CHECK-TLS4-NEXT:    store i32 0, i32* [[A]], align 8, !dbg [[DBG288:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG289:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_SMAIN:%.*]], %struct.Smain* [[THIS1]], i32 0, i32 0, !dbg [[DBG287:![0-9]+]]
+// CHECK-TLS4-NEXT:    store i32 0, i32* [[A]], align 8, !dbg [[DBG289:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG290:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
-// CHECK-TLS4-SAME: () #[[ATTR7]] !dbg [[DBG290:![0-9]+]] {
+// CHECK-TLS4-SAME: () #[[ATTR7]] !dbg [[DBG291:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i8, i8* bitcast (i64* @_ZGVN2STI2S4E2stE to i8*), align 8, !dbg [[DBG291:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[GUARD_UNINITIALIZED:%.*]] = icmp eq i8 [[TMP0]], 0, !dbg [[DBG291]]
-// CHECK-TLS4-NEXT:    br i1 [[GUARD_UNINITIALIZED]], label [[INIT_CHECK:%.*]], label [[INIT_END:%.*]], !dbg [[DBG291]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i8, i8* bitcast (i64* @_ZGVN2STI2S4E2stE to i8*), align 8, !dbg [[DBG292:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[GUARD_UNINITIALIZED:%.*]] = icmp eq i8 [[TMP0]], 0, !dbg [[DBG292]]
+// CHECK-TLS4-NEXT:    br i1 [[GUARD_UNINITIALIZED]], label [[INIT_CHECK:%.*]], label [[INIT_END:%.*]], !dbg [[DBG292]]
 // CHECK-TLS4:       init.check:
-// CHECK-TLS4-NEXT:    call void @_ZN2S4C1Ei(%struct.S4* noundef nonnull align 4 dereferenceable(8) @_ZN2STI2S4E2stE, i32 noundef 23), !dbg [[DBG292:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP1:%.*]] = call i32 @__cxa_thread_atexit(void (i8*)* bitcast (void (%struct.S4*)* @_ZN2S4D1Ev to void (i8*)*), i8* bitcast (%struct.S4* @_ZN2STI2S4E2stE to i8*), i8* @__dso_handle) #[[ATTR5]], !dbg [[DBG291]]
-// CHECK-TLS4-NEXT:    store i8 1, i8* bitcast (i64* @_ZGVN2STI2S4E2stE to i8*), align 8, !dbg [[DBG291]]
-// CHECK-TLS4-NEXT:    br label [[INIT_END]], !dbg [[DBG291]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S4C1Ei(%struct.S4* noundef nonnull align 4 dereferenceable(8) @_ZN2STI2S4E2stE, i32 noundef 23), !dbg [[DBG293:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP1:%.*]] = call i32 @__cxa_thread_atexit(void (i8*)* bitcast (void (%struct.S4*)* @_ZN2S4D1Ev to void (i8*)*), i8* bitcast (%struct.S4* @_ZN2STI2S4E2stE to i8*), i8* @__dso_handle) #[[ATTR5]], !dbg [[DBG292]]
+// CHECK-TLS4-NEXT:    store i8 1, i8* bitcast (i64* @_ZGVN2STI2S4E2stE to i8*), align 8, !dbg [[DBG292]]
+// CHECK-TLS4-NEXT:    br label [[INIT_END]], !dbg [[DBG292]]
 // CHECK-TLS4:       init.end:
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG294:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG295:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S4C1Ei
-// CHECK-TLS4-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 !dbg [[DBG295:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 !dbg [[DBG296:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S4*, align 8
 // CHECK-TLS4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS4-NEXT:    store %struct.S4* [[THIS]], %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META296:![0-9]+]], metadata !DIExpression()), !dbg [[DBG298:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META297:![0-9]+]], metadata !DIExpression()), !dbg [[DBG299:![0-9]+]]
 // CHECK-TLS4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META299:![0-9]+]], metadata !DIExpression()), !dbg [[DBG300:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META300:![0-9]+]], metadata !DIExpression()), !dbg [[DBG301:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S4*, %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG301:![0-9]+]]
-// CHECK-TLS4-NEXT:    call void @_ZN2S4C2Ei(%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG301]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG302:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG302:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S4C2Ei(%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS1]], i32 noundef [[TMP0]]), !dbg [[DBG302]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG303:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S4D1Ev
-// CHECK-TLS4-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG303:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG304:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S4*, align 8
 // CHECK-TLS4-NEXT:    store %struct.S4* [[THIS]], %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META304:![0-9]+]], metadata !DIExpression()), !dbg [[DBG305:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META305:![0-9]+]], metadata !DIExpression()), !dbg [[DBG306:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S4*, %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @_ZN2S4D2Ev(%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS1]]) #[[ATTR5]], !dbg [[DBG306:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG307:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @_ZN2S4D2Ev(%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS1]]) #[[ATTR5]], !dbg [[DBG307:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG308:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S4C2Ei
-// CHECK-TLS4-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG308:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG309:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S4*, align 8
 // CHECK-TLS4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-TLS4-NEXT:    store %struct.S4* [[THIS]], %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META309:![0-9]+]], metadata !DIExpression()), !dbg [[DBG310:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META310:![0-9]+]], metadata !DIExpression()), !dbg [[DBG311:![0-9]+]]
 // CHECK-TLS4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META311:![0-9]+]], metadata !DIExpression()), !dbg [[DBG312:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META312:![0-9]+]], metadata !DIExpression()), !dbg [[DBG313:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S4*, %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[THIS1]], i32 0, i32 0, !dbg [[DBG313:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG314:![0-9]+]]
-// CHECK-TLS4-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 4, !dbg [[DBG313]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG315:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A2:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[THIS1]], i32 0, i32 0, !dbg [[DBG314:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG315:![0-9]+]]
+// CHECK-TLS4-NEXT:    store i32 [[TMP0]], i32* [[A2]], align 4, !dbg [[DBG314]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG316:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_ZN2S4D2Ev
-// CHECK-TLS4-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG316:![0-9]+]] {
+// CHECK-TLS4-SAME: (%struct.S4* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 !dbg [[DBG317:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
 // CHECK-TLS4-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S4*, align 8
 // CHECK-TLS4-NEXT:    store %struct.S4* [[THIS]], %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META317:![0-9]+]], metadata !DIExpression()), !dbg [[DBG318:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @llvm.dbg.declare(metadata %struct.S4** [[THIS_ADDR]], metadata [[META318:![0-9]+]], metadata !DIExpression()), !dbg [[DBG319:![0-9]+]]
 // CHECK-TLS4-NEXT:    [[THIS1:%.*]] = load %struct.S4*, %struct.S4** [[THIS_ADDR]], align 8
-// CHECK-TLS4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[THIS1]], i32 0, i32 0, !dbg [[DBG319:![0-9]+]]
-// CHECK-TLS4-NEXT:    store i32 0, i32* [[A]], align 4, !dbg [[DBG321:![0-9]+]]
-// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG322:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S4:%.*]], %struct.S4* [[THIS1]], i32 0, i32 0, !dbg [[DBG320:![0-9]+]]
+// CHECK-TLS4-NEXT:    store i32 0, i32* [[A]], align 4, !dbg [[DBG322:![0-9]+]]
+// CHECK-TLS4-NEXT:    ret void, !dbg [[DBG323:![0-9]+]]
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_threadprivate_codegen.cpp
-// CHECK-TLS4-SAME: () #[[ATTR7]] !dbg [[DBG323:![0-9]+]] {
+// CHECK-TLS4-SAME: () #[[ATTR7]] !dbg [[DBG324:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
-// CHECK-TLS4-NEXT:    call void @__cxx_global_var_init.1(), !dbg [[DBG325:![0-9]+]]
+// CHECK-TLS4-NEXT:    call void @__cxx_global_var_init.1(), !dbg [[DBG326:![0-9]+]]
 // CHECK-TLS4-NEXT:    ret void
 //
 //
 // CHECK-TLS4-LABEL: define {{[^@]+}}@__tls_init
-// CHECK-TLS4-SAME: () #[[ATTR7]] !dbg [[DBG326:![0-9]+]] {
+// CHECK-TLS4-SAME: () #[[ATTR7]] !dbg [[DBG327:![0-9]+]] {
 // CHECK-TLS4-NEXT:  entry:
-// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i8, i8* @__tls_guard, align 1, !dbg [[DBG327:![0-9]+]]
-// CHECK-TLS4-NEXT:    [[GUARD_UNINITIALIZED:%.*]] = icmp eq i8 [[TMP0]], 0, !dbg [[DBG327]]
-// CHECK-TLS4-NEXT:    br i1 [[GUARD_UNINITIALIZED]], label [[INIT:%.*]], label [[EXIT:%.*]], !dbg [[DBG327]], !prof [[PROF119]]
+// CHECK-TLS4-NEXT:    [[TMP0:%.*]] = load i8, i8* @__tls_guard, align 1, !dbg [[DBG328:![0-9]+]]
+// CHECK-TLS4-NEXT:    [[GUARD_UNINITIALIZED:%.*]] = icmp eq i8 [[TMP0]], 0, !dbg [[DBG328]]
+// CHECK-TLS4-NEXT:    br i1 [[GUARD_UNINITIALIZED]], label [[INIT:%.*]], label [[EXIT:%.*]], !dbg [[DBG328]], !prof [[PROF119]]
 // CHECK-TLS4:       init:
-// CHECK-TLS4-NEXT:    store i8 1, i8* @__tls_guard, align 1, !dbg [[DBG327]]
-// CHECK-TLS4-NEXT:    call void @__cxx_global_var_init(), !dbg [[DBG327]]
-// CHECK-TLS4-NEXT:    call void @__cxx_global_var_init.2(), !dbg [[DBG327]]
-// CHECK-TLS4-NEXT:    br label [[EXIT]], !dbg [[DBG327]]
+// CHECK-TLS4-NEXT:    store i8 1, i8* @__tls_guard, align 1, !dbg [[DBG328]]
+// CHECK-TLS4-NEXT:    call void @__cxx_global_var_init(), !dbg [[DBG328]]
+// CHECK-TLS4-NEXT:    call void @__cxx_global_var_init.2(), !dbg [[DBG328]]
+// CHECK-TLS4-NEXT:    br label [[EXIT]], !dbg [[DBG328]]
 // CHECK-TLS4:       exit:
 // CHECK-TLS4-NEXT:    ret void
 //


        


More information about the cfe-commits mailing list