[clang] [llvm] [Clang][OpenMP] Port clang codegen code for OpenMP project (PR #85795)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 21 08:40:50 PDT 2024
https://github.com/TSWorld1314 updated https://github.com/llvm/llvm-project/pull/85795
>From c44d8c1c7986fad6370273aca55e2db99d47387a Mon Sep 17 00:00:00 2001
From: "Harrison,Hao" <tsworld1314 at gmail.com>
Date: Tue, 19 Mar 2024 13:18:07 +0000
Subject: [PATCH 1/2] [Clang][OpenMP] Port clang codegen code for GPU First
project
---
clang/include/clang/Basic/LangOptions.def | 1 +
clang/lib/CodeGen/CGBuilder.h | 9 +
clang/lib/CodeGen/CGDecl.cpp | 218 +++++++++---------
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 45 ++--
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 9 +
5 files changed, 156 insertions(+), 126 deletions(-)
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 8ef6700ecdc78e..64b87ecdc97524 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -260,6 +260,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
+LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h
index bf5ab171d720d9..fe5beff05134ac 100644
--- a/clang/lib/CodeGen/CGBuilder.h
+++ b/clang/lib/CodeGen/CGBuilder.h
@@ -152,6 +152,15 @@ class CGBuilderTy : public CGBuilderBaseTy {
Addr.isKnownNonNull());
}
+ /// Cast the element type of the given address to a different type,
+ /// preserving information like the alignment and address space.
+ Address CreateElementBitCast(Address Addr, llvm::Type *Ty,
+ const llvm::Twine &Name = "") {
+ auto *PtrTy = Ty->getPointerTo(Addr.getAddressSpace());
+ return Address(CreateBitCast(Addr.getPointer(), PtrTy, Name), Ty,
+ Addr.getAlignment(), Addr.isKnownNonNull());
+ }
+
using CGBuilderBaseTy::CreatePointerBitCastOrAddrSpaceCast;
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty,
llvm::Type *ElementTy,
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index dc42faf8dbb9fd..691af33dc239d6 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -2531,48 +2531,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
(IPD->getParameterKind() == ImplicitParamKind::ThreadPrivateVar);
}
- Address DeclPtr = Address::invalid();
- Address AllocaPtr = Address::invalid();
- bool DoStore = false;
- bool IsScalar = hasScalarEvaluationKind(Ty);
- bool UseIndirectDebugAddress = false;
-
- // If we already have a pointer to the argument, reuse the input pointer.
- if (Arg.isIndirect()) {
- DeclPtr = Arg.getIndirectAddress();
- DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
- // Indirect argument is in alloca address space, which may be different
- // from the default address space.
- auto AllocaAS = CGM.getASTAllocaAddressSpace();
- auto *V = DeclPtr.getPointer();
- AllocaPtr = DeclPtr;
-
- // For truly ABI indirect arguments -- those that are not `byval` -- store
- // the address of the argument on the stack to preserve debug information.
- ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info;
- if (ArgInfo.isIndirect())
- UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
- if (UseIndirectDebugAddress) {
- auto PtrTy = getContext().getPointerType(Ty);
- AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
- D.getName() + ".indirect_addr");
- EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
- }
-
- auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
- auto DestLangAS =
- getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
- if (SrcLangAS != DestLangAS) {
- assert(getContext().getTargetAddressSpace(SrcLangAS) ==
- CGM.getDataLayout().getAllocaAddrSpace());
- auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
- auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
- DeclPtr =
- DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
- *this, V, SrcLangAS, DestLangAS, T, true),
- DeclPtr.isKnownNonNull());
- }
-
+ auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) {
// Push a destructor cleanup for this parameter if the ABI requires it.
// Don't push a cleanup in a thunk for a method that will also emit a
// cleanup.
@@ -2588,87 +2547,126 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
EHStack.stable_begin();
}
}
+ };
+
+ Address DeclPtr = Address::invalid();
+ Address AllocaPtr = Address::invalid();
+ Address OpenMPLocalAddr =
+ getLangOpts().OpenMP
+ ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
+ : Address::invalid();
+ bool DoStore = false;
+ bool IsScalar = hasScalarEvaluationKind(Ty);
+ bool UseIndirectDebugAddress = false;
+ if (OpenMPLocalAddr.isValid()) {
+ DeclPtr = OpenMPLocalAddr;
+ AllocaPtr = DeclPtr;
+ LValue Dst = MakeAddrLValue(DeclPtr, Ty);
+ if (Arg.isIndirect()) {
+ LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty);
+ callCStructCopyConstructor(Dst, Src);
+ PushCleanupIfNeeded(Arg.getIndirectAddress());
+ } else {
+ EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true);
+ }
} else {
- // Check if the parameter address is controlled by OpenMP runtime.
- Address OpenMPLocalAddr =
- getLangOpts().OpenMP
- ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
- : Address::invalid();
- if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
- DeclPtr = OpenMPLocalAddr;
+ // If we already have a pointer to the argument, reuse the input pointer.
+ if (Arg.isIndirect()) {
+ // If we have a prettier pointer type at this point, bitcast to that.
+ DeclPtr = Arg.getIndirectAddress();
+ DeclPtr = Builder.CreateElementBitCast(DeclPtr, ConvertTypeForMem(Ty),
+ D.getName());
+ // Indirect argument is in alloca address space, which may be different
+ // from the default address space.
+ auto AllocaAS = CGM.getASTAllocaAddressSpace();
+ auto *V = DeclPtr.getPointer();
AllocaPtr = DeclPtr;
+ auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
+ auto DestLangAS =
+ getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
+ if (SrcLangAS != DestLangAS) {
+ assert(getContext().getTargetAddressSpace(SrcLangAS) ==
+ CGM.getDataLayout().getAllocaAddrSpace());
+ auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
+ auto *T = DeclPtr.getElementType()->getPointerTo(DestAS);
+ DeclPtr =
+ DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
+ *this, V, SrcLangAS, DestLangAS, T, true),
+ DeclPtr.isKnownNonNull());
+ }
+ PushCleanupIfNeeded(DeclPtr);
} else {
- // Otherwise, create a temporary to hold the value.
+ // Create a temporary to hold the value.
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
D.getName() + ".addr", &AllocaPtr);
+ DoStore = true;
}
- DoStore = true;
- }
-
- llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
-
- LValue lv = MakeAddrLValue(DeclPtr, Ty);
- if (IsScalar) {
- Qualifiers qs = Ty.getQualifiers();
- if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
- // We honor __attribute__((ns_consumed)) for types with lifetime.
- // For __strong, it's handled by just skipping the initial retain;
- // otherwise we have to balance out the initial +1 with an extra
- // cleanup to do the release at the end of the function.
- bool isConsumed = D.hasAttr<NSConsumedAttr>();
-
- // If a parameter is pseudo-strong then we can omit the implicit retain.
- if (D.isARCPseudoStrong()) {
- assert(lt == Qualifiers::OCL_Strong &&
- "pseudo-strong variable isn't strong?");
- assert(qs.hasConst() && "pseudo-strong variable should be const!");
- lt = Qualifiers::OCL_ExplicitNone;
- }
- // Load objects passed indirectly.
- if (Arg.isIndirect() && !ArgVal)
- ArgVal = Builder.CreateLoad(DeclPtr);
-
- if (lt == Qualifiers::OCL_Strong) {
- if (!isConsumed) {
- if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
- // use objc_storeStrong(&dest, value) for retaining the
- // object. But first, store a null into 'dest' because
- // objc_storeStrong attempts to release its old value.
- llvm::Value *Null = CGM.EmitNullConstant(D.getType());
- EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
- EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
- DoStore = false;
- }
- else
- // Don't use objc_retainBlock for block pointers, because we
- // don't want to Block_copy something just because we got it
- // as a parameter.
- ArgVal = EmitARCRetainNonBlock(ArgVal);
- }
- } else {
- // Push the cleanup for a consumed parameter.
- if (isConsumed) {
- ARCPreciseLifetime_t precise = (D.hasAttr<ObjCPreciseLifetimeAttr>()
- ? ARCPreciseLifetime : ARCImpreciseLifetime);
- EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(), ArgVal,
- precise);
+ llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
+
+ LValue lv = MakeAddrLValue(DeclPtr, Ty);
+ if (IsScalar) {
+ Qualifiers qs = Ty.getQualifiers();
+ if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
+ // We honor __attribute__((ns_consumed)) for types with lifetime.
+ // For __strong, it's handled by just skipping the initial retain;
+ // otherwise we have to balance out the initial +1 with an extra
+ // cleanup to do the release at the end of the function.
+ bool isConsumed = D.hasAttr<NSConsumedAttr>();
+
+ // If a parameter is pseudo-strong then we can omit the implicit retain.
+ if (D.isARCPseudoStrong()) {
+ assert(lt == Qualifiers::OCL_Strong &&
+ "pseudo-strong variable isn't strong?");
+ assert(qs.hasConst() && "pseudo-strong variable should be const!");
+ lt = Qualifiers::OCL_ExplicitNone;
}
- if (lt == Qualifiers::OCL_Weak) {
- EmitARCInitWeak(DeclPtr, ArgVal);
- DoStore = false; // The weak init is a store, no need to do two.
+ // Load objects passed indirectly.
+ if (Arg.isIndirect() && !ArgVal)
+ ArgVal = Builder.CreateLoad(DeclPtr);
+
+ if (lt == Qualifiers::OCL_Strong) {
+ if (!isConsumed) {
+ if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
+ // use objc_storeStrong(&dest, value) for retaining the
+ // object. But first, store a null into 'dest' because
+ // objc_storeStrong attempts to release its old value.
+ llvm::Value *Null = CGM.EmitNullConstant(D.getType());
+ EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
+ EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
+ DoStore = false;
+ } else
+ // Don't use objc_retainBlock for block pointers, because we
+ // don't want to Block_copy something just because we got it
+ // as a parameter.
+ ArgVal = EmitARCRetainNonBlock(ArgVal);
+ }
+ } else {
+ // Push the cleanup for a consumed parameter.
+ if (isConsumed) {
+ ARCPreciseLifetime_t precise =
+ (D.hasAttr<ObjCPreciseLifetimeAttr>() ? ARCPreciseLifetime
+ : ARCImpreciseLifetime);
+ EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(),
+ ArgVal, precise);
+ }
+
+ if (lt == Qualifiers::OCL_Weak) {
+ EmitARCInitWeak(DeclPtr, ArgVal);
+ DoStore = false; // The weak init is a store, no need to do two.
+ }
}
- }
- // Enter the cleanup scope.
- EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
+ // Enter the cleanup scope.
+ EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
+ }
}
- }
- // Store the initial value into the alloca.
- if (DoStore)
- EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
+ // Store the initial value into the alloca.
+ if (DoStore)
+ EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
+ }
setAddrOfLocalVar(&D, DeclPtr);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 299ee1460b3db0..8f0c7caa2f3b4b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1083,10 +1083,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
- llvm::CallBase *VoidPtr =
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_alloc_shared),
- AllocArgs, VD->getName());
+ llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
+ ? OMPRTL_malloc
+ : OMPRTL___kmpc_alloc_shared),
+ AllocArgs, VD->getName());
// FIXME: We should use the variables actual alignment as an argument.
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment,
@@ -1149,10 +1151,12 @@ CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {Size};
- llvm::CallBase *VoidPtr =
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_alloc_shared),
- AllocArgs, VD->getName());
+ llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
+ ? OMPRTL_malloc
+ : OMPRTL___kmpc_alloc_shared),
+ AllocArgs, VD->getName());
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
@@ -1178,20 +1182,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
// globalized in the prolog (i.e. emitGenericVarsProlog).
for (const auto &AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_free_shared),
- {AddrSizePair.first, AddrSizePair.second});
+ if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
+ CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
+ {AddrSizePair.first});
+ else
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_free_shared),
+ {AddrSizePair.first, AddrSizePair.second});
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
const auto *VD = cast<VarDecl>(Rec.first);
I->getSecond().MappedParams->restore(CGF);
- llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
- CGF.getTypeSize(VD->getType())};
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_free_shared),
- FreeArgs);
+ if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
+ CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
+ {Rec.second.GlobalizedVal});
+ else
+ CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_free_shared),
+ {Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
}
}
}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index d22d2a8e948b00..90d5d197396749 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -227,7 +227,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
__OMP_RTL(omp_get_thread_num, false, Int32, )
+__OMP_RTL(omp_get_bulk_thread_num, false, Int32, )
__OMP_RTL(omp_get_num_threads, false, Int32, )
+__OMP_RTL(omp_get_bulk_num_threads, false, Int32, )
__OMP_RTL(omp_get_max_threads, false, Int32, )
__OMP_RTL(omp_in_parallel, false, Int32, )
__OMP_RTL(omp_get_dynamic, false, Int32, )
@@ -490,6 +492,8 @@ __OMP_RTL(__kmpc_reduction_get_fixed_buffer, false, VoidPtr, )
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
+__OMP_RTL(malloc, false, VoidPtr, SizeTy)
+__OMP_RTL(free, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
@@ -503,6 +507,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
+__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32,
+ Int32, VoidPtrPtr, Int64)
+
__OMP_RTL(__last, false, Void, )
#undef __OMP_RTL
@@ -710,6 +717,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())
>From 8a74a4aaf069e90f4583c9da58bc3e9315e6bcfc Mon Sep 17 00:00:00 2001
From: "Harrison,Hao" <tsworld1314 at gmail.com>
Date: Thu, 21 Mar 2024 15:40:26 +0000
Subject: [PATCH 2/2] [Clang][OpenMP] Port the OpenMP code of clang for GPU OpenMP
---
clang/include/clang/Driver/Options.td | 4 +
clang/lib/Driver/ToolChains/Clang.cpp | 2 +
.../ClangLinkerWrapper.cpp | 174 ++++++++++++++++--
.../llvm/Frontend/OpenMP/OMPDeviceConstants.h | 13 ++
llvm/include/llvm/LTO/Config.h | 3 +
llvm/include/llvm/LTO/LTO.h | 3 +
6 files changed, 188 insertions(+), 11 deletions(-)
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4b1fcf1db1ad09..dd742ead7c0f25 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3497,6 +3497,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested
HelpText<"Assert no nested parallel regions in the GPU">,
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
+def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">,
+ HelpText<"Globalize to global space for the globalized variables">,
+ MarshallingInfoFlag<LangOpts<"OpenMPGlobalizeToGlobalSpace">>;
+
} // let Group = f_Group
} // let Visibility = [ClangOption, CC1Option, FC1Option]
} // let Flags = [NoArgumentUnused, HelpHidden]
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 055884d275ce1b..92aadc8fd4ce63 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6519,6 +6519,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-offload-mandatory");
if (Args.hasArg(options::OPT_fopenmp_force_usm))
CmdArgs.push_back("-fopenmp-force-usm");
+ if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space))
+ CmdArgs.push_back("-fopenmp-globalize-to-global-space");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index c60be2789bd61e..32a051799a6e35 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -137,6 +137,13 @@ static constexpr OptTable::Info InfoTable[] = {
#undef OPTION
};
+/// Host RPC module that will be shared to the corresponding pass.
+Module *HostModule = nullptr;
+/// We only need to generate the host RPC module once.
+bool IsHostModuleGenerated = false;
+/// Host RPC object file.
+StringRef HostRPCObjFile;
+
class WrapperOptTable : public opt::GenericOptTable {
public:
WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
@@ -614,10 +621,12 @@ std::vector<std::string> getTargetFeatures(ArrayRef<OffloadFile> InputFiles) {
return UnifiedFeatures;
}
-template <typename ModuleHook = function_ref<bool(size_t, const Module &)>>
+template <typename PreHookTy = function_ref<bool(size_t, const Module &)>,
+ typename PostHookTy = function_ref<bool(size_t, const Module &)>>
std::unique_ptr<lto::LTO> createLTO(
const ArgList &Args, const std::vector<std::string> &Features,
- ModuleHook Hook = [](size_t, const Module &) { return true; }) {
+ PreHookTy PreHook = [](size_t, const Module &) { return true; },
+ PostHookTy PostHook = [](size_t, const Module &) { return true; }) {
const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
// We need to remove AMD's target-id from the processor if present.
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first;
@@ -672,10 +681,10 @@ std::unique_ptr<lto::LTO> createLTO(
return true;
};
}
- Conf.PostOptModuleHook = Hook;
- Conf.CGFileType = (Triple.isNVPTX() || SaveTemps)
- ? CodeGenFileType::AssemblyFile
- : CodeGenFileType::ObjectFile;
+
+ Conf.PreOptModuleHook = PreHook;
+ Conf.PostOptModuleHook = PostHook;
+ Conf.CGFileType = Triple.isNVPTX() ? CodeGenFileType::AssemblyFile : CodeGenFileType::ObjectFile;
// TODO: Handle remark files
Conf.HasWholeProgramVisibility = Args.hasArg(OPT_whole_program);
@@ -691,6 +700,58 @@ bool isValidCIdentifier(StringRef S) {
[](char C) { return C == '_' || isAlnum(C); });
}
+bool writeHostModule(std::string &FileName) {
+ if (!HostModule)
+ return false;
+ if (HostModule->getFunctionList().empty())
+ return false;
+
+ auto HostTriple = HostModule->getTargetTriple();
+ FileName =
+ sys::path::filename(ExecutableName).str() + "-host-rpc-" + HostTriple;
+ auto TempFileOrErr = createOutputFile(FileName, "bc");
+ if (!TempFileOrErr)
+ reportError(TempFileOrErr.takeError());
+ int FD = -1;
+ if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
+ reportError(errorCodeToError(EC));
+
+ auto Out = std::make_unique<llvm::raw_fd_ostream>(FD, true);
+ WriteBitcodeToFile(*HostModule, *Out);
+
+ return true;
+}
+
+std::unique_ptr<lto::LTO> createHostRPCLTO(StringRef HostTriple) {
+ const llvm::Triple Triple(HostTriple);
+ lto::Config Conf;
+ lto::ThinBackend Backend;
+ Backend =
+ lto::createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
+
+ // TODO: host arch?
+ // Conf.CPU = Arch.str();
+ Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
+
+ // TODO: host features?
+ // Conf.MAttrs = Features;
+ Conf.CGOptLevel = *CodeGenOpt::getLevel(3);
+ Conf.OptLevel = 3;
+ Conf.UseDefaultPipeline = true;
+ Conf.DefaultTriple = Triple.getTriple();
+
+ LTOError = false;
+ Conf.DiagHandler = diagnosticHandler;
+
+ Conf.PTO.LoopVectorization = Conf.OptLevel > 1;
+ Conf.PTO.SLPVectorization = Conf.OptLevel > 1;
+ Conf.CGFileType = CodeGenFileType::ObjectFile;
+
+ Conf.HasWholeProgramVisibility = false;
+
+ return std::make_unique<lto::LTO>(std::move(Conf), Backend);
+}
+
Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
SmallVectorImpl<StringRef> &OutputFiles,
const ArgList &Args) {
@@ -776,14 +837,51 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
BitcodeOutput.push_back(*TempFileOrErr);
return false;
};
+ auto AddHostModuleAddr = [&](size_t, const Module &M) {
+ if (!HostModule)
+ return true;
+
+ Module &CM = const_cast<Module &>(M);
+ auto *MD = CM.getOrInsertNamedMetadata("llvm.hostrpc.hostmodule");
+ MD->clearOperands();
+ MD->addOperand(MDTuple::get(
+ CM.getContext(), {ConstantAsMetadata::get(ConstantInt::get(
+ Type::getInt64Ty(CM.getContext()),
+ reinterpret_cast<uintptr_t>(HostModule)))}));
+ return true;
+ };
// We assume visibility of the whole program if every input file was bitcode.
auto Features = getTargetFeatures(BitcodeInputFiles);
- auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
- Args.hasArg(OPT_builtin_bitcode_EQ) ||
- Args.hasArg(OPT_clang_backend)
- ? createLTO(Args, Features, OutputBitcode)
- : createLTO(Args, Features);
+ auto LTOBackend =
+ Args.hasArg(OPT_embed_bitcode)
+ ? createLTO(Args, Features, AddHostModuleAddr, OutputBitcode)
+ : createLTO(Args, Features, AddHostModuleAddr);
+
+ LLVMContext &Ctx = LTOBackend->getContext();
+ StringRef HostTriple =
+ Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple());
+ std::unique_ptr<Module> HostModulePtr;
+ if (!IsHostModuleGenerated) {
+ HostModulePtr = std::make_unique<Module>(
+ sys::path::filename(ExecutableName).str() + "-host-rpc.bc", Ctx);
+ HostModule = HostModulePtr.get();
+ HostModulePtr->setTargetTriple(HostTriple);
+
+ std::string Msg;
+ const Target *T =
+ TargetRegistry::lookupTarget(HostModule->getTargetTriple(), Msg);
+ if (!T)
+ return createStringError(inconvertibleErrorCode(), Msg);
+ auto Options =
+ codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple(HostTriple));
+ StringRef CPU = "";
+ StringRef Features = "";
+ std::unique_ptr<TargetMachine> TM(
+ T->createTargetMachine(HostTriple, CPU, Features, Options, Reloc::PIC_,
+ HostModule->getCodeModel()));
+ HostModule->setDataLayout(TM->createDataLayout());
+ }
// We need to resolve the symbols so the LTO backend knows which symbols need
// to be kept or can be internalized. This is a simplified symbol resolution
@@ -877,6 +975,57 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
if (Error Err = LTOBackend->run(AddStream))
return Err;
+ std::string HostModuleTempFile;
+ bool ValidHostModule = writeHostModule(HostModuleTempFile);
+ // Reset the HostModule pointer.
+ HostModulePtr.reset();
+ HostModule = nullptr;
+ // TODO: this is really redundant code.
+ if (ValidHostModule) {
+ auto HostLTO = createHostRPCLTO(HostTriple);
+
+ std::string HostBitCodeFile = HostModuleTempFile + ".bc";
+ auto BufferOrError = MemoryBuffer::getFile(HostBitCodeFile);
+ if (!BufferOrError)
+ reportError(createFileError(HostBitCodeFile, BufferOrError.getError()));
+ Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
+ llvm::lto::InputFile::create(*BufferOrError.get());
+ if (!BitcodeFileOrErr)
+ return BitcodeFileOrErr.takeError();
+
+ const auto Symbols = (*BitcodeFileOrErr)->symbols();
+ SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
+ size_t Idx = 0;
+ for (auto &Sym : Symbols) {
+ (void)Sym;
+ lto::SymbolResolution &Res = Resolutions[Idx++];
+ Res.ExportDynamic = true;
+ Res.VisibleToRegularObj = true;
+ Res.LinkerRedefined = false;
+ Res.Prevailing = true;
+ }
+ if (Error Err = HostLTO->add(std::move(*BitcodeFileOrErr), Resolutions))
+ return Err;
+
+ auto RPCAddStream =
+ [&](size_t Task,
+ const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
+ int FD = -1;
+ auto TempFileOrErr = createOutputFile(
+ sys::path::filename(ExecutableName) + "-host-rpc-" + HostTriple, "o");
+ if (!TempFileOrErr)
+ reportError(TempFileOrErr.takeError());
+ HostRPCObjFile = *TempFileOrErr;
+ if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
+ reportError(errorCodeToError(EC));
+ return std::make_unique<CachedFileStream>(
+ std::make_unique<llvm::raw_fd_ostream>(FD, true));
+ };
+
+ if (Error Err = HostLTO->run(RPCAddStream))
+ return Err;
+ }
+
if (LTOError)
return createStringError(inconvertibleErrorCode(),
"Errors encountered inside the LTO pipeline.");
@@ -1245,6 +1394,9 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
WrappedOutput.push_back(*OutputOrErr);
}
+ if (!HostRPCObjFile.empty())
+ WrappedOutput.push_back(HostRPCObjFile);
+
return WrappedOutput;
}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
index ccf8e727c40454..811ca72b576e66 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
@@ -25,6 +25,19 @@ enum OMPTgtExecModeFlags : unsigned char {
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
};
+enum OMPTgtHostRPCArgType {
+ // No need to copy.
+ OMP_HOST_RPC_ARG_SCALAR = 0,
+ OMP_HOST_RPC_ARG_PTR = 1,
+ // Copy to device.
+ OMP_HOST_RPC_ARG_COPY_TO = OMP_HOST_RPC_ARG_PTR | (1 << 1),
+ // Copy from device.
+ OMP_HOST_RPC_ARG_COPY_FROM = OMP_HOST_RPC_ARG_PTR | (1 << 2),
+ // Copy to and from device.
+ OMP_HOST_RPC_ARG_COPY_TOFROM =
+ OMP_HOST_RPC_ARG_COPY_TO | OMP_HOST_RPC_ARG_COPY_FROM,
+};
+
} // end namespace omp
} // end namespace llvm
diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h
index 482b6e55a19d35..fe70dce082e1d6 100644
--- a/llvm/include/llvm/LTO/Config.h
+++ b/llvm/include/llvm/LTO/Config.h
@@ -60,6 +60,9 @@ struct Config {
bool VerifyEach = false;
bool DisableVerify = false;
+ /// Use the standard optimization pipeline.
+ bool UseDefaultPipeline = false;
+
/// Flag to indicate that the optimizer should not assume builtins are present
/// on the target.
bool Freestanding = false;
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 94996ae89e35d0..431ef3a09e07ac 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -303,6 +303,9 @@ class LTO {
/// by LTO but might not be visible from bitcode symbol table.
static ArrayRef<const char*> getRuntimeLibcallSymbols();
+ /// Returns the context.
+ LLVMContext &getContext() { return RegularLTO.Ctx; }
+
private:
Config Conf;
More information about the llvm-commits
mailing list