[clang] [llvm] [Clang][OpenMP] Port clang codegen code for openmp project (PR #85795)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Mar 19 07:23:51 PDT 2024
https://github.com/TSWorld1314 created https://github.com/llvm/llvm-project/pull/85795
From c44d8c1c7986fad6370273aca55e2db99d47387a Mon Sep 17 00:00:00 2001
From: "Harrison,Hao" <tsworld1314 at gmail.com>
Date: Tue, 19 Mar 2024 13:18:07 +0000
Subject: [PATCH] [Clang][OpenMP] Port clang codegen code for GPU First project
---
clang/include/clang/Basic/LangOptions.def | 1 +
clang/lib/CodeGen/CGBuilder.h | 9 +
clang/lib/CodeGen/CGDecl.cpp | 218 +++++++++---------
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 45 ++--
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 9 +
5 files changed, 156 insertions(+), 126 deletions(-)
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 8ef6700ecdc78ec..64b87ecdc97524c 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -260,6 +260,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
+LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Allocate globalized variables in the global address space")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
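
Note for reviewers: each LANGOPT(Name, Bits, Default, Description) entry in
LangOptions.def expands to a bitfield on clang::LangOptions, so the new option
is consumed elsewhere in this patch through getLangOpts(). A minimal sketch of
the consumer side (the real checks are in the CGOpenMPRuntimeGPU.cpp hunks
below):

    // Sketch only; mirrors the checks added later in this patch.
    if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace) {
      // Globalized variables go to global memory via malloc/free.
    } else {
      // Default device path: __kmpc_alloc_shared / __kmpc_free_shared.
    }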
diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h
index bf5ab171d720d9b..fe5beff05134ac6 100644
--- a/clang/lib/CodeGen/CGBuilder.h
+++ b/clang/lib/CodeGen/CGBuilder.h
@@ -152,6 +152,15 @@ class CGBuilderTy : public CGBuilderBaseTy {
Addr.isKnownNonNull());
}
+ /// Cast the element type of the given address to a different type,
+ /// preserving information like the alignment and address space.
+ Address CreateElementBitCast(Address Addr, llvm::Type *Ty,
+ const llvm::Twine &Name = "") {
+ auto *PtrTy = Ty->getPointerTo(Addr.getAddressSpace());
+ return Address(CreateBitCast(Addr.getPointer(), PtrTy, Name), Ty,
+ Addr.getAlignment(), Addr.isKnownNonNull());
+ }
+
using CGBuilderBaseTy::CreatePointerBitCastOrAddrSpaceCast;
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty,
llvm::Type *ElementTy,
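
Note for reviewers: the reintroduced CreateElementBitCast retypes the pointee
while preserving the Address's alignment, address space, and known-non-null
information. The CGDecl.cpp hunk below uses it when reusing an indirect
argument's pointer, roughly:

    // Excerpted from the CGDecl.cpp change below.
    Address DeclPtr = Arg.getIndirectAddress();
    DeclPtr = Builder.CreateElementBitCast(DeclPtr, ConvertTypeForMem(Ty),
                                           D.getName());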
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index dc42faf8dbb9fda..691af33dc239d67 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -2531,48 +2531,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
(IPD->getParameterKind() == ImplicitParamKind::ThreadPrivateVar);
}
- Address DeclPtr = Address::invalid();
- Address AllocaPtr = Address::invalid();
- bool DoStore = false;
- bool IsScalar = hasScalarEvaluationKind(Ty);
- bool UseIndirectDebugAddress = false;
-
- // If we already have a pointer to the argument, reuse the input pointer.
- if (Arg.isIndirect()) {
- DeclPtr = Arg.getIndirectAddress();
- DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
- // Indirect argument is in alloca address space, which may be different
- // from the default address space.
- auto AllocaAS = CGM.getASTAllocaAddressSpace();
- auto *V = DeclPtr.getPointer();
- AllocaPtr = DeclPtr;
-
- // For truly ABI indirect arguments -- those that are not `byval` -- store
- // the address of the argument on the stack to preserve debug information.
- ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info;
- if (ArgInfo.isIndirect())
- UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
- if (UseIndirectDebugAddress) {
- auto PtrTy = getContext().getPointerType(Ty);
- AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
- D.getName() + ".indirect_addr");
- EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
- }
-
- auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
- auto DestLangAS =
- getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
- if (SrcLangAS != DestLangAS) {
- assert(getContext().getTargetAddressSpace(SrcLangAS) ==
- CGM.getDataLayout().getAllocaAddrSpace());
- auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
- auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
- DeclPtr =
- DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
- *this, V, SrcLangAS, DestLangAS, T, true),
- DeclPtr.isKnownNonNull());
- }
-
+ auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) {
// Push a destructor cleanup for this parameter if the ABI requires it.
// Don't push a cleanup in a thunk for a method that will also emit a
// cleanup.
@@ -2588,87 +2547,126 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
EHStack.stable_begin();
}
}
+ };
+
+ Address DeclPtr = Address::invalid();
+ Address AllocaPtr = Address::invalid();
+ Address OpenMPLocalAddr =
+ getLangOpts().OpenMP
+ ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
+ : Address::invalid();
+ bool DoStore = false;
+ bool IsScalar = hasScalarEvaluationKind(Ty);
+ bool UseIndirectDebugAddress = false;
+ if (OpenMPLocalAddr.isValid()) {
+ DeclPtr = OpenMPLocalAddr;
+ AllocaPtr = DeclPtr;
+ LValue Dst = MakeAddrLValue(DeclPtr, Ty);
+ if (Arg.isIndirect()) {
+ LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty);
+ callCStructCopyConstructor(Dst, Src);
+ PushCleanupIfNeeded(Arg.getIndirectAddress());
+ } else {
+ EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true);
+ }
} else {
- // Check if the parameter address is controlled by OpenMP runtime.
- Address OpenMPLocalAddr =
- getLangOpts().OpenMP
- ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
- : Address::invalid();
- if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
- DeclPtr = OpenMPLocalAddr;
+ // If we already have a pointer to the argument, reuse the input pointer.
+ if (Arg.isIndirect()) {
+ // If we have a prettier pointer type at this point, bitcast to that.
+ DeclPtr = Arg.getIndirectAddress();
+ DeclPtr = Builder.CreateElementBitCast(DeclPtr, ConvertTypeForMem(Ty),
+ D.getName());
+ // Indirect argument is in alloca address space, which may be different
+ // from the default address space.
+ auto AllocaAS = CGM.getASTAllocaAddressSpace();
+ auto *V = DeclPtr.getPointer();
AllocaPtr = DeclPtr;
+ auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
+ auto DestLangAS =
+ getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
+ if (SrcLangAS != DestLangAS) {
+ assert(getContext().getTargetAddressSpace(SrcLangAS) ==
+ CGM.getDataLayout().getAllocaAddrSpace());
+ auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
+ auto *T = DeclPtr.getElementType()->getPointerTo(DestAS);
+ DeclPtr =
+ DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
+ *this, V, SrcLangAS, DestLangAS, T, true),
+ DeclPtr.isKnownNonNull());
+ }
+ PushCleanupIfNeeded(DeclPtr);
} else {
- // Otherwise, create a temporary to hold the value.
+ // Create a temporary to hold the value.
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
D.getName() + ".addr", &AllocaPtr);
+ DoStore = true;
}
- DoStore = true;
- }
-
- llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
-
- LValue lv = MakeAddrLValue(DeclPtr, Ty);
- if (IsScalar) {
- Qualifiers qs = Ty.getQualifiers();
- if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
- // We honor __attribute__((ns_consumed)) for types with lifetime.
- // For __strong, it's handled by just skipping the initial retain;
- // otherwise we have to balance out the initial +1 with an extra
- // cleanup to do the release at the end of the function.
- bool isConsumed = D.hasAttr<NSConsumedAttr>();
-
- // If a parameter is pseudo-strong then we can omit the implicit retain.
- if (D.isARCPseudoStrong()) {
- assert(lt == Qualifiers::OCL_Strong &&
- "pseudo-strong variable isn't strong?");
- assert(qs.hasConst() && "pseudo-strong variable should be const!");
- lt = Qualifiers::OCL_ExplicitNone;
- }
- // Load objects passed indirectly.
- if (Arg.isIndirect() && !ArgVal)
- ArgVal = Builder.CreateLoad(DeclPtr);
-
- if (lt == Qualifiers::OCL_Strong) {
- if (!isConsumed) {
- if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
- // use objc_storeStrong(&dest, value) for retaining the
- // object. But first, store a null into 'dest' because
- // objc_storeStrong attempts to release its old value.
- llvm::Value *Null = CGM.EmitNullConstant(D.getType());
- EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
- EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
- DoStore = false;
- }
- else
- // Don't use objc_retainBlock for block pointers, because we
- // don't want to Block_copy something just because we got it
- // as a parameter.
- ArgVal = EmitARCRetainNonBlock(ArgVal);
- }
- } else {
- // Push the cleanup for a consumed parameter.
- if (isConsumed) {
- ARCPreciseLifetime_t precise = (D.hasAttr<ObjCPreciseLifetimeAttr>()
- ? ARCPreciseLifetime : ARCImpreciseLifetime);
- EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(), ArgVal,
- precise);
+ llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
+
+ LValue lv = MakeAddrLValue(DeclPtr, Ty);
+ if (IsScalar) {
+ Qualifiers qs = Ty.getQualifiers();
+ if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
+ // We honor __attribute__((ns_consumed)) for types with lifetime.
+ // For __strong, it's handled by just skipping the initial retain;
+ // otherwise we have to balance out the initial +1 with an extra
+ // cleanup to do the release at the end of the function.
+ bool isConsumed = D.hasAttr<NSConsumedAttr>();
+
+ // If a parameter is pseudo-strong then we can omit the implicit retain.
+ if (D.isARCPseudoStrong()) {
+ assert(lt == Qualifiers::OCL_Strong &&
+ "pseudo-strong variable isn't strong?");
+ assert(qs.hasConst() && "pseudo-strong variable should be const!");
+ lt = Qualifiers::OCL_ExplicitNone;
}
- if (lt == Qualifiers::OCL_Weak) {
- EmitARCInitWeak(DeclPtr, ArgVal);
- DoStore = false; // The weak init is a store, no need to do two.
+ // Load objects passed indirectly.
+ if (Arg.isIndirect() && !ArgVal)
+ ArgVal = Builder.CreateLoad(DeclPtr);
+
+ if (lt == Qualifiers::OCL_Strong) {
+ if (!isConsumed) {
+ if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
+ // use objc_storeStrong(&dest, value) for retaining the
+ // object. But first, store a null into 'dest' because
+ // objc_storeStrong attempts to release its old value.
+ llvm::Value *Null = CGM.EmitNullConstant(D.getType());
+ EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
+ EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
+ DoStore = false;
+ } else
+ // Don't use objc_retainBlock for block pointers, because we
+ // don't want to Block_copy something just because we got it
+ // as a parameter.
+ ArgVal = EmitARCRetainNonBlock(ArgVal);
+ }
+ } else {
+ // Push the cleanup for a consumed parameter.
+ if (isConsumed) {
+ ARCPreciseLifetime_t precise =
+ (D.hasAttr<ObjCPreciseLifetimeAttr>() ? ARCPreciseLifetime
+ : ARCImpreciseLifetime);
+ EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(),
+ ArgVal, precise);
+ }
+
+ if (lt == Qualifiers::OCL_Weak) {
+ EmitARCInitWeak(DeclPtr, ArgVal);
+ DoStore = false; // The weak init is a store, no need to do two.
+ }
}
- }
- // Enter the cleanup scope.
- EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
+ // Enter the cleanup scope.
+ EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
+ }
}
- }
- // Store the initial value into the alloca.
- if (DoStore)
- EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
+ // Store the initial value into the alloca.
+ if (DoStore)
+ EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
+ }
setAddrOfLocalVar(&D, DeclPtr);
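
Note for reviewers: the EmitParmDecl restructuring factors the parameter
cleanup into a PushCleanupIfNeeded lambda and branches first on whether the
OpenMP runtime provides storage for the parameter. A simplified sketch of the
resulting control flow (not literal patch code):

    if (OpenMPLocalAddr.isValid()) {
      // Runtime-provided storage: copy (indirect) or store (direct) the
      // argument into it; cleanups run on the original indirect address.
    } else if (Arg.isIndirect()) {
      // Reuse the caller's pointer, address-space casting if needed.
    } else {
      // Plain temporary alloca plus an initializing store, followed by
      // the usual scalar/ARC handling.
    }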
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 299ee1460b3db0e..8f0c7caa2f3b4b0 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1083,10 +1083,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
- llvm::CallBase *VoidPtr =
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_alloc_shared),
- AllocArgs, VD->getName());
+ llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
+ ? OMPRTL_malloc
+ : OMPRTL___kmpc_alloc_shared),
+ AllocArgs, VD->getName());
// FIXME: We should use the variables actual alignment as an argument.
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment,
@@ -1149,10 +1151,12 @@ CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {Size};
- llvm::CallBase *VoidPtr =
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_alloc_shared),
- AllocArgs, VD->getName());
+ llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
+ ? OMPRTL_malloc
+ : OMPRTL___kmpc_alloc_shared),
+ AllocArgs, VD->getName());
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
@@ -1178,20 +1182,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
// globalized in the prolog (i.e. emitGenericVarsProlog).
for (const auto &AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_free_shared),
- {AddrSizePair.first, AddrSizePair.second});
+ if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
+ CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
+ {AddrSizePair.first});
+ else
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_free_shared),
+ {AddrSizePair.first, AddrSizePair.second});
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
const auto *VD = cast<VarDecl>(Rec.first);
I->getSecond().MappedParams->restore(CGF);
- llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
- CGF.getTypeSize(VD->getType())};
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_free_shared),
- FreeArgs);
+ if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
+ CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
+ {Rec.second.GlobalizedVal});
+ else
+ CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+ OMPRTL___kmpc_free_shared),
+ {Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
}
}
}
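
Note for reviewers: with OpenMPGlobalizeToGlobalSpace set, the prolog/epilog
pairs emitted here reduce to plain C allocation; the default path is
unchanged. A source-level sketch of the two pairings (assuming the device
runtime exports malloc/free as declared in the OMPKinds.def hunk below):

    // OpenMPGlobalizeToGlobalSpace:
    void *P = malloc(Size);              // prolog
    free(P);                             // epilog; no size argument
    // Default:
    void *P = __kmpc_alloc_shared(Size); // prolog
    __kmpc_free_shared(P, Size);         // epilog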
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index d22d2a8e948b00e..90d5d1973967493 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -227,7 +227,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
__OMP_RTL(omp_get_thread_num, false, Int32, )
+__OMP_RTL(omp_get_bulk_thread_num, false, Int32, )
__OMP_RTL(omp_get_num_threads, false, Int32, )
+__OMP_RTL(omp_get_bulk_num_threads, false, Int32, )
__OMP_RTL(omp_get_max_threads, false, Int32, )
__OMP_RTL(omp_in_parallel, false, Int32, )
__OMP_RTL(omp_get_dynamic, false, Int32, )
@@ -490,6 +492,8 @@ __OMP_RTL(__kmpc_reduction_get_fixed_buffer, false, VoidPtr, )
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
+__OMP_RTL(malloc, false, VoidPtr, SizeTy)
+__OMP_RTL(free, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
@@ -503,6 +507,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
+__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32,
+ Int32, VoidPtrPtr, Int64)
+
__OMP_RTL(__last, false, Void, )
#undef __OMP_RTL
@@ -710,6 +717,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())
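
Note for reviewers: each __OMP_RTL(Name, IsVarArg, ReturnType, ParamTypes...)
entry declares a runtime function the frontend may emit calls to. The entries
added by this patch correspond to roughly these C prototypes (a sketch; the
exact device-runtime signatures may differ):

    int   omp_get_bulk_thread_num(void);
    int   omp_get_bulk_num_threads(void);
    void *malloc(size_t);
    void  free(void *);
    void  __kmpc_launch_parallel_51_kernel(char *, int32_t, int32_t,
                                           int32_t, void **, int64_t);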