[clang] [llvm] [Clang][OpenMP] Port clang codegen code for OpenMP project (PR #85795)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 21 08:40:50 PDT 2024
https://github.com/TSWorld1314 updated https://github.com/llvm/llvm-project/pull/85795
>From c44d8c1c7986fad6370273aca55e2db99d47387a Mon Sep 17 00:00:00 2001
From: "Harrison,Hao" <tsworld1314 at gmail.com>
Date: Tue, 19 Mar 2024 13:18:07 +0000
Subject: [PATCH 1/2] [Clang][OpenMP] Port clang codegen code for GPU First
project
---
clang/include/clang/Basic/LangOptions.def | 1 +
clang/lib/CodeGen/CGBuilder.h | 9 +
clang/lib/CodeGen/CGDecl.cpp | 218 +++++++++---------
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 45 ++--
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 9 +
5 files changed, 156 insertions(+), 126 deletions(-)
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 8ef6700ecdc78e..64b87ecdc97524 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -260,6 +260,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
+LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h
index bf5ab171d720d9..fe5beff05134ac 100644
--- a/clang/lib/CodeGen/CGBuilder.h
+++ b/clang/lib/CodeGen/CGBuilder.h
@@ -152,6 +152,15 @@ class CGBuilderTy : public CGBuilderBaseTy {
Addr.isKnownNonNull());
}
+ /// Cast the element type of the given address to a different type,
+ /// preserving information like the alignment and address space.
+ Address CreateElementBitCast(Address Addr, llvm::Type *Ty,
+ const llvm::Twine &Name = "") {
+ auto *PtrTy = Ty->getPointerTo(Addr.getAddressSpace());
+ return Address(CreateBitCast(Addr.getPointer(), PtrTy, Name), Ty,
+ Addr.getAlignment(), Addr.isKnownNonNull());
+ }
+
using CGBuilderBaseTy::CreatePointerBitCastOrAddrSpaceCast;
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty,
llvm::Type *ElementTy,
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index dc42faf8dbb9fd..691af33dc239d6 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -2531,48 +2531,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
(IPD->getParameterKind() == ImplicitParamKind::ThreadPrivateVar);
}
- Address DeclPtr = Address::invalid();
- Address AllocaPtr = Address::invalid();
- bool DoStore = false;
- bool IsScalar = hasScalarEvaluationKind(Ty);
- bool UseIndirectDebugAddress = false;
-
- // If we already have a pointer to the argument, reuse the input pointer.
- if (Arg.isIndirect()) {
- DeclPtr = Arg.getIndirectAddress();
- DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
- // Indirect argument is in alloca address space, which may be different
- // from the default address space.
- auto AllocaAS = CGM.getASTAllocaAddressSpace();
- auto *V = DeclPtr.getPointer();
- AllocaPtr = DeclPtr;
-
- // For truly ABI indirect arguments -- those that are not `byval` -- store
- // the address of the argument on the stack to preserve debug information.
- ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info;
- if (ArgInfo.isIndirect())
- UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
- if (UseIndirectDebugAddress) {
- auto PtrTy = getContext().getPointerType(Ty);
- AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
- D.getName() + ".indirect_addr");
- EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
- }
-
- auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
- auto DestLangAS =
- getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
- if (SrcLangAS != DestLangAS) {
- assert(getContext().getTargetAddressSpace(SrcLangAS) ==
- CGM.getDataLayout().getAllocaAddrSpace());
- auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
- auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
- DeclPtr =
- DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
- *this, V, SrcLangAS, DestLangAS, T, true),
- DeclPtr.isKnownNonNull());
- }
-
+ auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) {
// Push a destructor cleanup for this parameter if the ABI requires it.
// Don't push a cleanup in a thunk for a method that will also emit a
// cleanup.
@@ -2588,87 +2547,126 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
EHStack.stable_begin();
}
}
+ };
+
+ Address DeclPtr = Address::invalid();
+ Address AllocaPtr = Address::invalid();
+ Address OpenMPLocalAddr =
+ getLangOpts().OpenMP
+ ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
+ : Address::invalid();
+ bool DoStore = false;
+ bool IsScalar = hasScalarEvaluationKind(Ty);
+ bool UseIndirectDebugAddress = false;
+ if (OpenMPLocalAddr.isValid()) {
+ DeclPtr = OpenMPLocalAddr;
+ AllocaPtr = DeclPtr;
+ LValue Dst = MakeAddrLValue(DeclPtr, Ty);
+ if (Arg.isIndirect()) {
+ LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty);
+ callCStructCopyConstructor(Dst, Src);
+ PushCleanupIfNeeded(Arg.getIndirectAddress());
+ } else {
+ EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true);
+ }
} else {
- // Check if the parameter address is controlled by OpenMP runtime.
- Address OpenMPLocalAddr =
- getLangOpts().OpenMP
- ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
- : Address::invalid();
- if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
- DeclPtr = OpenMPLocalAddr;
+ // If we already have a pointer to the argument, reuse the input pointer.
+ if (Arg.isIndirect()) {
+ // If we have a prettier pointer type at this point, bitcast to that.
+ DeclPtr = Arg.getIndirectAddress();
+ DeclPtr = Builder.CreateElementBitCast(DeclPtr, ConvertTypeForMem(Ty),
+ D.getName());
+ // Indirect argument is in alloca address space, which may be different
+ // from the default address space.
+ auto AllocaAS = CGM.getASTAllocaAddressSpace();
+ auto *V = DeclPtr.getPointer();
AllocaPtr = DeclPtr;
+ auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
+ auto DestLangAS =
+ getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
+ if (SrcLangAS != DestLangAS) {
+ assert(getContext().getTargetAddressSpace(SrcLangAS) ==
+ CGM.getDataLayout().getAllocaAddrSpace());
+ auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
+ auto *T = DeclPtr.getElementType()->getPointerTo(DestAS);
+ DeclPtr =
+ DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
+ *this, V, SrcLangAS, DestLangAS, T, true),
+ DeclPtr.isKnownNonNull());
+ }
+ PushCleanupIfNeeded(DeclPtr);
} else {
- // Otherwise, create a temporary to hold the value.
+ // Create a temporary to hold the value.
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
D.getName() + ".addr", &AllocaPtr);
+ DoStore = true;
}
- DoStore = true;
- }
-
- llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
-
- LValue lv = MakeAddrLValue(DeclPtr, Ty);
- if (IsScalar) {
- Qualifiers qs = Ty.getQualifiers();
- if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
- // We honor __attribute__((ns_consumed)) for types with lifetime.
- // For __strong, it's handled by just skipping the initial retain;
- // otherwise we have to balance out the initial +1 with an extra
- // cleanup to do the release at the end of the function.
- bool isConsumed = D.hasAttr<NSConsumedAttr>();
-
- // If a parameter is pseudo-strong then we can omit the implicit retain.
- if (D.isARCPseudoStrong()) {
- assert(lt == Qualifiers::OCL_Strong &&
- "pseudo-strong variable isn't strong?");
- assert(qs.hasConst() && "pseudo-strong variable should be const!");
- lt = Qualifiers::OCL_ExplicitNone;
- }
- // Load objects passed indirectly.
- if (Arg.isIndirect() && !ArgVal)
- ArgVal = Builder.CreateLoad(DeclPtr);
-
- if (lt == Qualifiers::OCL_Strong) {
- if (!isConsumed) {
- if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
- // use objc_storeStrong(&dest, value) for retaining the
- // object. But first, store a null into 'dest' because
- // objc_storeStrong attempts to release its old value.
- llvm::Value *Null = CGM.EmitNullConstant(D.getType());
- EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
- EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
- DoStore = false;
- }
- else
- // Don't use objc_retainBlock for block pointers, because we
- // don't want to Block_copy something just because we got it
- // as a parameter.
- ArgVal = EmitARCRetainNonBlock(ArgVal);
- }
- } else {
- // Push the cleanup for a consumed parameter.
- if (isConsumed) {
- ARCPreciseLifetime_t precise = (D.hasAttr<ObjCPreciseLifetimeAttr>()
- ? ARCPreciseLifetime : ARCImpreciseLifetime);
- EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(), ArgVal,
- precise);
+ llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
+
+ LValue lv = MakeAddrLValue(DeclPtr, Ty);
+ if (IsScalar) {
+ Qualifiers qs = Ty.getQualifiers();
+ if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
+ // We honor __attribute__((ns_consumed)) for types with lifetime.
+ // For __strong, it's handled by just skipping the initial retain;
+ // otherwise we have to balance out the initial +1 with an extra
+ // cleanup to do the release at the end of the function.
+ bool isConsumed = D.hasAttr<NSConsumedAttr>();
+
+ // If a parameter is pseudo-strong then we can omit the implicit retain.
+ if (D.isARCPseudoStrong()) {
+ assert(lt == Qualifiers::OCL_Strong &&
+ "pseudo-strong variable isn't strong?");
+ assert(qs.hasConst() && "pseudo-strong variable should be const!");
+ lt = Qualifiers::OCL_ExplicitNone;
}
- if (lt == Qualifiers::OCL_Weak) {
- EmitARCInitWeak(DeclPtr, ArgVal);
- DoStore = false; // The weak init is a store, no need to do two.
+ // Load objects passed indirectly.
+ if (Arg.isIndirect() && !ArgVal)
+ ArgVal = Builder.CreateLoad(DeclPtr);
+
+ if (lt == Qualifiers::OCL_Strong) {
+ if (!isConsumed) {
+ if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
+ // use objc_storeStrong(&dest, value) for retaining the
+ // object. But first, store a null into 'dest' because
+ // objc_storeStrong attempts to release its old value.
+ llvm::Value *Null = CGM.EmitNullConstant(D.getType());
+ EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
+ EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
+ DoStore = false;
+ } else
+ // Don't use objc_retainBlock for block pointers, because we
+ // don't want to Block_copy something just because we got it
+ // as a parameter.
+ ArgVal = EmitARCRetainNonBlock(ArgVal);
+ }
+ } else {
+ // Push the cleanup for a consumed parameter.
+ if (isConsumed) {
+ ARCPreciseLifetime_t precise =
+ (D.hasAttr<ObjCPreciseLifetimeAttr>() ? ARCPreciseLifetime
+ : ARCImpreciseLifetime);
+ EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(),
+ ArgVal, precise);
+ }
+
+ if (lt == Qualifiers::OCL_Weak) {
+ EmitARCInitWeak(DeclPtr, ArgVal);
+ DoStore = false; // The weak init is a store, no need to do two.
+ }
}
- }
- // Enter the cleanup scope.
- EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
+ // Enter the cleanup scope.
+ EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
+ }
}
- }
- // Store the initial value into the alloca.
- if (DoStore)
- EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
+ // Store the initial value into the alloca.
+ if (DoStore)
+ EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
+ }
setAddrOfLocalVar(&D, DeclPtr);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 299ee1460b3db0..8f0c7caa2f3b4b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1083,10 +1083,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
- llvm::CallBase *VoidPtr =
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_alloc_shared),
- AllocArgs, VD->getName());
+ llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
+ ? OMPRTL_malloc
+ : OMPRTL___kmpc_alloc_shared),
+ AllocArgs, VD->getName());
// FIXME: We should use the variables actual alignment as an argument.
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment,
@@ -1149,10 +1151,12 @@ CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {Size};
- llvm::CallBase *VoidPtr =
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_alloc_shared),
- AllocArgs, VD->getName());
+ llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
+ ? OMPRTL_malloc
+ : OMPRTL___kmpc_alloc_shared),
+ AllocArgs, VD->getName());
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
@@ -1178,20 +1182,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
// globalized in the prolog (i.e. emitGenericVarsProlog).
for (const auto &AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_free_shared),
- {AddrSizePair.first, AddrSizePair.second});
+ if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
+ CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
+ {AddrSizePair.first});
+ else
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_free_shared),
+ {AddrSizePair.first, AddrSizePair.second});
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
const auto *VD = cast<VarDecl>(Rec.first);
I->getSecond().MappedParams->restore(CGF);
- llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
- CGF.getTypeSize(VD->getType())};
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_free_shared),
- FreeArgs);
+ if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
+ CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
+ {Rec.second.GlobalizedVal});
+ else
+ CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_free_shared),
+ {Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
}
}
}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index d22d2a8e948b00..90d5d197396749 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -227,7 +227,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
__OMP_RTL(omp_get_thread_num, false, Int32, )
+__OMP_RTL(omp_get_bulk_thread_num, false, Int32, )
__OMP_RTL(omp_get_num_threads, false, Int32, )
+__OMP_RTL(omp_get_bulk_num_threads, false, Int32, )
__OMP_RTL(omp_get_max_threads, false, Int32, )
__OMP_RTL(omp_in_parallel, false, Int32, )
__OMP_RTL(omp_get_dynamic, false, Int32, )
@@ -490,6 +492,8 @@ __OMP_RTL(__kmpc_reduction_get_fixed_buffer, false, VoidPtr, )
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
+__OMP_RTL(malloc, false, VoidPtr, SizeTy)
+__OMP_RTL(free, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
@@ -503,6 +507,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
+__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32,
+ Int32, VoidPtrPtr, Int64)
+
__OMP_RTL(__last, false, Void, )
#undef __OMP_RTL
@@ -710,6 +717,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())
>From 8a74a4aaf069e90f4583c9da58bc3e9315e6bcfc Mon Sep 17 00:00:00 2001
From: "Harrison,Hao" <tsworld1314 at gmail.com>
Date: Thu, 21 Mar 2024 15:40:26 +0000
Subject: [PATCH 2/2] [Clang][OpenMP] Port the OpenMP code of clang for GPU OpenMP
---
clang/include/clang/Driver/Options.td | 4 +
clang/lib/Driver/ToolChains/Clang.cpp | 2 +
.../ClangLinkerWrapper.cpp | 174 ++++++++++++++++--
.../llvm/Frontend/OpenMP/OMPDeviceConstants.h | 13 ++
llvm/include/llvm/LTO/Config.h | 3 +
llvm/include/llvm/LTO/LTO.h | 3 +
6 files changed, 188 insertions(+), 11 deletions(-)
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4b1fcf1db1ad09..dd742ead7c0f25 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3497,6 +3497,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested
HelpText<"Assert no nested parallel regions in the GPU">,
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
+def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">,
+ HelpText<"Globalize to global space for the globalized variables">,
+ MarshallingInfoFlag<LangOpts<"OpenMPGlobalizeToGlobalSpace">>;
+
} // let Group = f_Group
} // let Visibility = [ClangOption, CC1Option, FC1Option]
} // let Flags = [NoArgumentUnused, HelpHidden]
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 055884d275ce1b..92aadc8fd4ce63 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6519,6 +6519,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-offload-mandatory");
if (Args.hasArg(options::OPT_fopenmp_force_usm))
CmdArgs.push_back("-fopenmp-force-usm");
+ if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space))
+ CmdArgs.push_back("-fopenmp-globalize-to-global-space");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index c60be2789bd61e..32a051799a6e35 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -137,6 +137,13 @@ static constexpr OptTable::Info InfoTable[] = {
#undef OPTION
};
+/// Host RPC module that will be shared to the corresponding pass.
+Module *HostModule = nullptr;
+/// We only need to generate the host RPC module once.
+bool IsHostModuleGenerated = false;
+/// Host RPC object file.
+StringRef HostRPCObjFile;
+
class WrapperOptTable : public opt::GenericOptTable {
public:
WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
@@ -614,10 +621,12 @@ std::vector<std::string> getTargetFeatures(ArrayRef<OffloadFile> InputFiles) {
return UnifiedFeatures;
}
-template <typename ModuleHook = function_ref<bool(size_t, const Module &)>>
+template <typename PreHookTy = function_ref<bool(size_t, const Module &)>,
+ typename PostHookTy = function_ref<bool(size_t, const Module &)>>
std::unique_ptr<lto::LTO> createLTO(
const ArgList &Args, const std::vector<std::string> &Features,
- ModuleHook Hook = [](size_t, const Module &) { return true; }) {
+ PreHookTy PreHook = [](size_t, const Module &) { return true; },
+ PostHookTy PostHook = [](size_t, const Module &) { return true; }) {
const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
// We need to remove AMD's target-id from the processor if present.
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first;
@@ -672,10 +681,10 @@ std::unique_ptr<lto::LTO> createLTO(
return true;
};
}
- Conf.PostOptModuleHook = Hook;
- Conf.CGFileType = (Triple.isNVPTX() || SaveTemps)
- ? CodeGenFileType::AssemblyFile
- : CodeGenFileType::ObjectFile;
+
+ Conf.PreOptModuleHook = PreHook;
+ Conf.PostOptModuleHook = PostHook;
+ Conf.CGFileType = Triple.isNVPTX() ? CodeGenFileType::AssemblyFile : CodeGenFileType::ObjectFile;
// TODO: Handle remark files
Conf.HasWholeProgramVisibility = Args.hasArg(OPT_whole_program);
@@ -691,6 +700,58 @@ bool isValidCIdentifier(StringRef S) {
[](char C) { return C == '_' || isAlnum(C); });
}
+bool writeHostModule(std::string &FileName) {
+ if (!HostModule)
+ return false;
+ if (HostModule->getFunctionList().empty())
+ return false;
+
+ auto HostTriple = HostModule->getTargetTriple();
+ FileName =
+ sys::path::filename(ExecutableName).str() + "-host-rpc-" + HostTriple;
+ auto TempFileOrErr = createOutputFile(FileName, "bc");
+ if (!TempFileOrErr)
+ reportError(TempFileOrErr.takeError());
+ int FD = -1;
+ if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
+ reportError(errorCodeToError(EC));
+
+ auto Out = std::make_unique<llvm::raw_fd_ostream>(FD, true);
+ WriteBitcodeToFile(*HostModule, *Out);
+
+ return true;
+}
+
+std::unique_ptr<lto::LTO> createHostRPCLTO(StringRef HostTriple) {
+ const llvm::Triple Triple(HostTriple);
+ lto::Config Conf;
+ lto::ThinBackend Backend;
+ Backend =
+ lto::createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
+
+ // TODO: host arch?
+ // Conf.CPU = Arch.str();
+ Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
+
+ // TODO: host features?
+ // Conf.MAttrs = Features;
+ Conf.CGOptLevel = *CodeGenOpt::getLevel(3);
+ Conf.OptLevel = 3;
+ Conf.UseDefaultPipeline = true;
+ Conf.DefaultTriple = Triple.getTriple();
+
+ LTOError = false;
+ Conf.DiagHandler = diagnosticHandler;
+
+ Conf.PTO.LoopVectorization = Conf.OptLevel > 1;
+ Conf.PTO.SLPVectorization = Conf.OptLevel > 1;
+ Conf.CGFileType = CodeGenFileType::ObjectFile;
+
+ Conf.HasWholeProgramVisibility = false;
+
+ return std::make_unique<lto::LTO>(std::move(Conf), Backend);
+}
+
Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
SmallVectorImpl<StringRef> &OutputFiles,
const ArgList &Args) {
@@ -776,14 +837,51 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
BitcodeOutput.push_back(*TempFileOrErr);
return false;
};
+ auto AddHostModuleAddr = [&](size_t, const Module &M) {
+ if (!HostModule)
+ return true;
+
+ Module &CM = const_cast<Module &>(M);
+ auto *MD = CM.getOrInsertNamedMetadata("llvm.hostrpc.hostmodule");
+ MD->clearOperands();
+ MD->addOperand(MDTuple::get(
+ CM.getContext(), {ConstantAsMetadata::get(ConstantInt::get(
+ Type::getInt64Ty(CM.getContext()),
+ reinterpret_cast<uintptr_t>(HostModule)))}));
+ return true;
+ };
// We assume visibility of the whole program if every input file was bitcode.
auto Features = getTargetFeatures(BitcodeInputFiles);
- auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
- Args.hasArg(OPT_builtin_bitcode_EQ) ||
- Args.hasArg(OPT_clang_backend)
- ? createLTO(Args, Features, OutputBitcode)
- : createLTO(Args, Features);
+ auto LTOBackend =
+ Args.hasArg(OPT_embed_bitcode)
+ ? createLTO(Args, Features, AddHostModuleAddr, OutputBitcode)
+ : createLTO(Args, Features, AddHostModuleAddr);
+
+ LLVMContext &Ctx = LTOBackend->getContext();
+ StringRef HostTriple =
+ Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple());
+ std::unique_ptr<Module> HostModulePtr;
+ if (!IsHostModuleGenerated) {
+ HostModulePtr = std::make_unique<Module>(
+ sys::path::filename(ExecutableName).str() + "-host-rpc.bc", Ctx);
+ HostModule = HostModulePtr.get();
+ HostModulePtr->setTargetTriple(HostTriple);
+
+ std::string Msg;
+ const Target *T =
+ TargetRegistry::lookupTarget(HostModule->getTargetTriple(), Msg);
+ if (!T)
+ return createStringError(inconvertibleErrorCode(), Msg);
+ auto Options =
+ codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple(HostTriple));
+ StringRef CPU = "";
+ StringRef Features = "";
+ std::unique_ptr<TargetMachine> TM(
+ T->createTargetMachine(HostTriple, CPU, Features, Options, Reloc::PIC_,
+ HostModule->getCodeModel()));
+ HostModule->setDataLayout(TM->createDataLayout());
+ }
// We need to resolve the symbols so the LTO backend knows which symbols need
// to be kept or can be internalized. This is a simplified symbol resolution
@@ -877,6 +975,57 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
if (Error Err = LTOBackend->run(AddStream))
return Err;
+ std::string HostModuleTempFile;
+ bool ValidHostModule = writeHostModule(HostModuleTempFile);
+ // Reset the HostModule pointer.
+ HostModulePtr.reset();
+ HostModule = nullptr;
+ // TODO: this is really redundant code.
+ if (ValidHostModule) {
+ auto HostLTO = createHostRPCLTO(HostTriple);
+
+ std::string HostBitCodeFile = HostModuleTempFile + ".bc";
+ auto BufferOrError = MemoryBuffer::getFile(HostBitCodeFile);
+ if (!BufferOrError)
+ reportError(createFileError(HostBitCodeFile, BufferOrError.getError()));
+ Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
+ llvm::lto::InputFile::create(*BufferOrError.get());
+ if (!BitcodeFileOrErr)
+ return BitcodeFileOrErr.takeError();
+
+ const auto Symbols = (*BitcodeFileOrErr)->symbols();
+ SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
+ size_t Idx = 0;
+ for (auto &Sym : Symbols) {
+ (void)Sym;
+ lto::SymbolResolution &Res = Resolutions[Idx++];
+ Res.ExportDynamic = true;
+ Res.VisibleToRegularObj = true;
+ Res.LinkerRedefined = false;
+ Res.Prevailing = true;
+ }
+ if (Error Err = HostLTO->add(std::move(*BitcodeFileOrErr), Resolutions))
+ return Err;
+
+ auto RPCAddStream =
+ [&](size_t Task,
+ const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
+ int FD = -1;
+ auto TempFileOrErr = createOutputFile(
+ sys::path::filename(ExecutableName) + "-host-rpc-" + HostTriple, "o");
+ if (!TempFileOrErr)
+ reportError(TempFileOrErr.takeError());
+ HostRPCObjFile = *TempFileOrErr;
+ if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
+ reportError(errorCodeToError(EC));
+ return std::make_unique<CachedFileStream>(
+ std::make_unique<llvm::raw_fd_ostream>(FD, true));
+ };
+
+ if (Error Err = HostLTO->run(RPCAddStream))
+ return Err;
+ }
+
if (LTOError)
return createStringError(inconvertibleErrorCode(),
"Errors encountered inside the LTO pipeline.");
@@ -1245,6 +1394,9 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
WrappedOutput.push_back(*OutputOrErr);
}
+ if (!HostRPCObjFile.empty())
+ WrappedOutput.push_back(HostRPCObjFile);
+
return WrappedOutput;
}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
index ccf8e727c40454..811ca72b576e66 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
@@ -25,6 +25,19 @@ enum OMPTgtExecModeFlags : unsigned char {
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
};
+enum OMPTgtHostRPCArgType {
+ // No need to copy.
+ OMP_HOST_RPC_ARG_SCALAR = 0,
+ OMP_HOST_RPC_ARG_PTR = 1,
+ // Copy to device.
+ OMP_HOST_RPC_ARG_COPY_TO = OMP_HOST_RPC_ARG_PTR | (1 << 1),
+ // Copy from device.
+ OMP_HOST_RPC_ARG_COPY_FROM = OMP_HOST_RPC_ARG_PTR | (1 << 2),
+ // Copy to and from device.
+ OMP_HOST_RPC_ARG_COPY_TOFROM =
+ OMP_HOST_RPC_ARG_COPY_TO | OMP_HOST_RPC_ARG_COPY_FROM,
+};
+
} // end namespace omp
} // end namespace llvm
diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h
index 482b6e55a19d35..fe70dce082e1d6 100644
--- a/llvm/include/llvm/LTO/Config.h
+++ b/llvm/include/llvm/LTO/Config.h
@@ -60,6 +60,9 @@ struct Config {
bool VerifyEach = false;
bool DisableVerify = false;
+ /// Use the standard optimization pipeline.
+ bool UseDefaultPipeline = false;
+
/// Flag to indicate that the optimizer should not assume builtins are present
/// on the target.
bool Freestanding = false;
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 94996ae89e35d0..431ef3a09e07ac 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -303,6 +303,9 @@ class LTO {
/// by LTO but might not be visible from bitcode symbol table.
static ArrayRef<const char*> getRuntimeLibcallSymbols();
+ /// Returns the context.
+ LLVMContext &getContext() { return RegularLTO.Ctx; }
+
private:
Config Conf;
More information about the llvm-commits
mailing list