r331642 - [OPENMP, NVPTX] Added support for L2 parallelism.
Alexey Bataev via cfe-commits
cfe-commits at lists.llvm.org
Mon May 7 07:50:06 PDT 2018
Author: abataev
Date: Mon May 7 07:50:05 2018
New Revision: 331642
URL: http://llvm.org/viewvc/llvm-project?rev=331642&view=rev
Log:
[OPENMP, NVPTX] Added support for L2 parallelism.
Added initial codegen for level 2, level 3, etc. parallelism. Currently, all
second-level and deeper parallel regions are executed sequentially.
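
For illustration, a minimal example of the nesting this covers (mine, not part
of the commit). The extra statement keeps the target region in non-SPMD mode
(per the new hasNestedSPMDDirective check below), so the inner region is
second-level parallelism and is emitted as a serialized call on NVPTX:

// Illustrative only. The outer region is a level-1 parallel region run by
// the worker threads; the inner one is level 2 and, with this patch,
// currently runs sequentially on the device.
#include <cstdio>

int main() {
  int n = 0;
#pragma omp target map(tofrom : n)
  {
    n = 1;               // second statement keeps the kernel non-SPMD
#pragma omp parallel     // level 1: executed by the workers
    {
#pragma omp parallel     // level 2: emitted as a serialized region
#pragma omp atomic
      ++n;
    }
  }
  std::printf("n = %d\n", n);
  return 0;
}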
Modified:
cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp
cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
cfe/trunk/lib/CodeGen/CodeGenModule.cpp
cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp
cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp?rev=331642&r1=331641&r2=331642&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp Mon May 7 07:50:05 2018
@@ -2764,13 +2764,6 @@ Address CGOpenMPRuntime::getAddrOfArtifi
CGM.getPointerAlign());
}
-/// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen
-/// function. Here is the logic:
-/// if (Cond) {
-/// ThenGen();
-/// } else {
-/// ElseGen();
-/// }
void CGOpenMPRuntime::emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
const RegionCodeGenTy &ThenGen,
const RegionCodeGenTy &ElseGen) {
Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp?rev=331642&r1=331641&r2=331642&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp Mon May 7 07:50:05 2018
@@ -93,6 +93,9 @@ enum OpenMPRTLFunctionNVPTX {
OMPRTL_NVPTX__kmpc_end_sharing_variables,
/// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
OMPRTL_NVPTX__kmpc_get_shared_variables,
+ /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
+ /// global_tid);
+ OMPRTL_NVPTX__kmpc_parallel_level,
};
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
@@ -131,19 +134,17 @@ public:
}
};
-// A class to track the execution mode when codegening directives within
-// a target region. The appropriate mode (generic/spmd) is set on entry
-// to the target region and used by containing directives such as 'parallel'
-// to emit optimized code.
+/// A class to track the execution mode when codegening directives within
+/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
+/// to the target region and used by containing directives such as 'parallel'
+/// to emit optimized code.
class ExecutionModeRAII {
private:
- CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
- CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
+ bool SavedMode;
+ bool &Mode;
public:
- ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
- CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
- : Mode(Mode) {
+ ExecutionModeRAII(bool &Mode, bool NewMode) : Mode(Mode) {
SavedMode = Mode;
Mode = NewMode;
}
@@ -579,24 +580,171 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctio
}
bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
- return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+ return IsInSPMDExecutionMode;
}
-static CGOpenMPRuntimeNVPTX::ExecutionMode
-getExecutionMode(CodeGenModule &CGM) {
- return CGM.getLangOpts().OpenMPCUDAMode
- ? CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd
- : CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
+static CGOpenMPRuntimeNVPTX::DataSharingMode
+getDataSharingMode(CodeGenModule &CGM) {
+ return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
+ : CGOpenMPRuntimeNVPTX::Generic;
+}
+
+/// Check for inner (nested) SPMD construct, if any
+static bool hasNestedSPMDDirective(const OMPExecutableDirective &D) {
+ const auto *CS = D.getCapturedStmt(OMPD_target);
+ const auto *Body = CS->getCapturedStmt()->IgnoreContainers();
+ const Stmt *ChildStmt = nullptr;
+ if (const auto *C = dyn_cast<CompoundStmt>(Body))
+ if (C->size() == 1)
+ ChildStmt = C->body_front();
+ if (!ChildStmt)
+ return false;
+
+ if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+ OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
+ // TODO: add further analysis for inner teams|distribute directives, if any.
+ switch (D.getDirectiveKind()) {
+ case OMPD_target:
+ return (isOpenMPParallelDirective(DKind) &&
+ !isOpenMPTeamsDirective(DKind) &&
+ !isOpenMPDistributeDirective(DKind)) ||
+ isOpenMPSimdDirective(DKind) ||
+ DKind == OMPD_teams_distribute_parallel_for;
+ case OMPD_target_teams:
+ return (isOpenMPParallelDirective(DKind) &&
+ !isOpenMPDistributeDirective(DKind)) ||
+ isOpenMPSimdDirective(DKind) ||
+ DKind == OMPD_distribute_parallel_for;
+ case OMPD_target_teams_distribute:
+ return isOpenMPParallelDirective(DKind) || isOpenMPSimdDirective(DKind);
+ case OMPD_target_simd:
+ case OMPD_target_parallel:
+ case OMPD_target_parallel_for:
+ case OMPD_target_parallel_for_simd:
+ case OMPD_target_teams_distribute_simd:
+ case OMPD_target_teams_distribute_parallel_for:
+ case OMPD_target_teams_distribute_parallel_for_simd:
+ case OMPD_parallel:
+ case OMPD_for:
+ case OMPD_parallel_for:
+ case OMPD_parallel_sections:
+ case OMPD_for_simd:
+ case OMPD_parallel_for_simd:
+ case OMPD_cancel:
+ case OMPD_cancellation_point:
+ case OMPD_ordered:
+ case OMPD_threadprivate:
+ case OMPD_task:
+ case OMPD_simd:
+ case OMPD_sections:
+ case OMPD_section:
+ case OMPD_single:
+ case OMPD_master:
+ case OMPD_critical:
+ case OMPD_taskyield:
+ case OMPD_barrier:
+ case OMPD_taskwait:
+ case OMPD_taskgroup:
+ case OMPD_atomic:
+ case OMPD_flush:
+ case OMPD_teams:
+ case OMPD_target_data:
+ case OMPD_target_exit_data:
+ case OMPD_target_enter_data:
+ case OMPD_distribute:
+ case OMPD_distribute_simd:
+ case OMPD_distribute_parallel_for:
+ case OMPD_distribute_parallel_for_simd:
+ case OMPD_teams_distribute:
+ case OMPD_teams_distribute_simd:
+ case OMPD_teams_distribute_parallel_for:
+ case OMPD_teams_distribute_parallel_for_simd:
+ case OMPD_target_update:
+ case OMPD_declare_simd:
+ case OMPD_declare_target:
+ case OMPD_end_declare_target:
+ case OMPD_declare_reduction:
+ case OMPD_taskloop:
+ case OMPD_taskloop_simd:
+ case OMPD_unknown:
+ llvm_unreachable("Unexpected directive.");
+ }
+ }
+
+ return false;
+}
+
+static bool supportsSPMDExecutionMode(const OMPExecutableDirective &D) {
+ OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
+ switch (DirectiveKind) {
+ case OMPD_target:
+ case OMPD_target_teams:
+ case OMPD_target_teams_distribute:
+ return hasNestedSPMDDirective(D);
+ case OMPD_target_simd:
+ case OMPD_target_parallel:
+ case OMPD_target_parallel_for:
+ case OMPD_target_parallel_for_simd:
+ case OMPD_target_teams_distribute_simd:
+ case OMPD_target_teams_distribute_parallel_for:
+ case OMPD_target_teams_distribute_parallel_for_simd:
+ return true;
+ case OMPD_parallel:
+ case OMPD_for:
+ case OMPD_parallel_for:
+ case OMPD_parallel_sections:
+ case OMPD_for_simd:
+ case OMPD_parallel_for_simd:
+ case OMPD_cancel:
+ case OMPD_cancellation_point:
+ case OMPD_ordered:
+ case OMPD_threadprivate:
+ case OMPD_task:
+ case OMPD_simd:
+ case OMPD_sections:
+ case OMPD_section:
+ case OMPD_single:
+ case OMPD_master:
+ case OMPD_critical:
+ case OMPD_taskyield:
+ case OMPD_barrier:
+ case OMPD_taskwait:
+ case OMPD_taskgroup:
+ case OMPD_atomic:
+ case OMPD_flush:
+ case OMPD_teams:
+ case OMPD_target_data:
+ case OMPD_target_exit_data:
+ case OMPD_target_enter_data:
+ case OMPD_distribute:
+ case OMPD_distribute_simd:
+ case OMPD_distribute_parallel_for:
+ case OMPD_distribute_parallel_for_simd:
+ case OMPD_teams_distribute:
+ case OMPD_teams_distribute_simd:
+ case OMPD_teams_distribute_parallel_for:
+ case OMPD_teams_distribute_parallel_for_simd:
+ case OMPD_target_update:
+ case OMPD_declare_simd:
+ case OMPD_declare_target:
+ case OMPD_end_declare_target:
+ case OMPD_declare_reduction:
+ case OMPD_taskloop:
+ case OMPD_taskloop_simd:
+ case OMPD_unknown:
+ break;
+ }
+ llvm_unreachable(
+ "Unknown programming model for OpenMP directive on NVPTX target.");
}
-void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
StringRef ParentName,
llvm::Function *&OutlinedFn,
llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen) {
- ExecutionModeRAII ModeRAII(CurrentExecutionMode,
- CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
+ ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/false);
EntryFunctionState EST;
WorkerFunctionState WST(CGM, D.getLocStart());
Work.clear();
@@ -613,11 +761,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericKe
: EST(EST), WST(WST) {}
void Enter(CodeGenFunction &CGF) override {
static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
- .emitGenericEntryHeader(CGF, EST, WST);
+ .emitNonSPMDEntryHeader(CGF, EST, WST);
}
void Exit(CodeGenFunction &CGF) override {
static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
- .emitGenericEntryFooter(CGF, EST);
+ .emitNonSPMDEntryFooter(CGF, EST);
}
} Action(EST, WST);
CodeGen.setAction(Action);
@@ -633,7 +781,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericKe
}
// Setup NVPTX threads for master-worker OpenMP scheme.
-void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
+void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
EntryFunctionState &EST,
WorkerFunctionState &WST) {
CGBuilderTy &Bld = CGF.Builder;
@@ -657,6 +805,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericEn
Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
CGF.EmitBlock(MasterBB);
+ IsInTargetMasterThreadRegion = true;
// SEQUENTIAL (MASTER) REGION START
// First action in sequential region:
// Initialize the state of the OpenMP runtime library on the GPU.
@@ -674,12 +823,14 @@ void CGOpenMPRuntimeNVPTX::emitGenericEn
emitGenericVarsProlog(CGF, WST.Loc);
}
-void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
+void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
EntryFunctionState &EST) {
- emitGenericVarsEpilog(CGF);
+ IsInTargetMasterThreadRegion = false;
if (!CGF.HaveInsertPoint())
return;
+ emitGenericVarsEpilog(CGF);
+
if (!EST.ExitBB)
EST.ExitBB = CGF.createBasicBlock(".exit");
@@ -707,8 +858,7 @@ void CGOpenMPRuntimeNVPTX::emitSpmdKerne
llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen) {
- ExecutionModeRAII ModeRAII(CurrentExecutionMode,
- CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
+ ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/true);
EntryFunctionState EST;
// Emit target region as a standalone region.
@@ -754,10 +904,17 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntry
CGF.EmitBranch(ExecuteBB);
CGF.EmitBlock(ExecuteBB);
+
+ emitGenericVarsProlog(CGF, D.getLocStart());
}
void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
EntryFunctionState &EST) {
+ if (!CGF.HaveInsertPoint())
+ return;
+
+ emitGenericVarsEpilog(CGF);
+
if (!EST.ExitBB)
EST.ExitBB = CGF.createBasicBlock(".exit");
@@ -781,11 +938,12 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntry
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
- CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
- auto *GVMode = new llvm::GlobalVariable(
- CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
- llvm::GlobalValue::WeakAnyLinkage,
- llvm::ConstantInt::get(CGM.Int8Ty, Mode), Twine(Name, "_exec_mode"));
+ bool Mode) {
+ auto *GVMode =
+ new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
+ llvm::GlobalValue::WeakAnyLinkage,
+ llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
+ Twine(Name, "_exec_mode"));
CGM.addCompilerUsedGlobal(GVMode);
}
@@ -846,8 +1004,8 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoo
Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
// On termination condition (workid == 0), exit loop.
- llvm::Value *ShouldTerminate =
- Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
+ llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
+ llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
// Activate requested workers.
@@ -886,6 +1044,22 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoo
CGF.EmitBlock(CheckNextBB);
}
+ // Default case: call to outlined function through pointer if the target
+ // region makes a declare target call that may contain an orphaned parallel
+ // directive.
+ auto *ParallelFnTy =
+ llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
+ /*isVarArg=*/false)
+ ->getPointerTo();
+ llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
+ // Insert call to work function via shared wrapper. The shared
+ // wrapper takes two arguments:
+ // - the parallelism level;
+ // - the thread ID;
+ emitCall(CGF, WST.Loc, WorkFnCast,
+ {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
+ // Go to end of parallel region.
+ CGF.EmitBranch(TerminateBB);
// Signal end of parallel region.
CGF.EmitBlock(TerminateBB);
@@ -1163,6 +1337,14 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntime
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
break;
}
+ case OMPRTL_NVPTX__kmpc_parallel_level: {
+ // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
+ llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
+ auto *FnTy =
+ llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
+ RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
+ break;
+ }
}
return RTLFn;
}
@@ -1198,27 +1380,19 @@ void CGOpenMPRuntimeNVPTX::emitTargetOut
assert(!ParentName.empty() && "Invalid target region parent name!");
- CGOpenMPRuntimeNVPTX::ExecutionMode Mode = getExecutionMode(CGM);
- switch (Mode) {
- case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
- emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
- CodeGen);
- break;
- case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
+ bool Mode = supportsSPMDExecutionMode(D);
+ if (Mode)
emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
CodeGen);
- break;
- case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
- llvm_unreachable(
- "Unknown programming model for OpenMP directive on NVPTX target.");
- }
+ else
+ emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+ CodeGen);
setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}
CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
- : CGOpenMPRuntime(CGM, "_", "$"),
- CurrentExecutionMode(ExecutionMode::Unknown) {
+ : CGOpenMPRuntime(CGM, "_", "$") {
if (!CGM.getLangOpts().OpenMPIsDevice)
llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
@@ -1258,23 +1432,32 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitP
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
SourceLocation &Loc;
+ bool &IsInParallelRegion;
+ bool PrevIsInParallelRegion;
public:
- NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
+ NVPTXPrePostActionTy(SourceLocation &Loc, bool &IsInParallelRegion)
+ : Loc(Loc), IsInParallelRegion(IsInParallelRegion) {}
void Enter(CodeGenFunction &CGF) override {
static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
.emitGenericVarsProlog(CGF, Loc);
+ PrevIsInParallelRegion = IsInParallelRegion;
+ IsInParallelRegion = true;
}
void Exit(CodeGenFunction &CGF) override {
+ IsInParallelRegion = PrevIsInParallelRegion;
static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
.emitGenericVarsEpilog(CGF);
}
- } Action(Loc);
+ } Action(Loc, IsInParallelRegion);
CodeGen.setAction(Action);
+ bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
+ IsInTargetMasterThreadRegion = false;
auto *OutlinedFun =
cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
D, ThreadIDVar, InnermostKind, CodeGen));
- if (!isInSpmdExecutionMode()) {
+ IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
+ if (!isInSpmdExecutionMode() && !IsInParallelRegion) {
llvm::Function *WrapperFun =
createParallelDataSharingWrapper(OutlinedFun, D);
WrapperFunctionsMap[OutlinedFun] = WrapperFun;
@@ -1316,6 +1499,9 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitT
void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
SourceLocation Loc) {
+ if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+ return;
+
CGBuilderTy &Bld = CGF.Builder;
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
@@ -1402,6 +1588,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVa
}
void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) {
+ if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+ return;
+
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I != FunctionGlobalizedDecls.end()) {
I->getSecond().MappedParams->restore(CGF);
@@ -1449,31 +1638,61 @@ void CGOpenMPRuntimeNVPTX::emitParallelC
if (isInSpmdExecutionMode())
emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
else
- emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+ emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}
-void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
+void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
- llvm::Function *WFn = WrapperFunctionsMap[Fn];
-
- assert(WFn && "Wrapper function does not exist!");
// Force inline this outlined function at its call site.
Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
- auto &&L0ParallelGen = [this, WFn, CapturedVars](CodeGenFunction &CGF,
- PrePostActionTy &) {
- CGBuilderTy &Bld = CGF.Builder;
+ Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
+ /*DestWidth=*/32, /*Signed=*/1),
+ ".zero.addr");
+ CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
+ Address ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
+ auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, ThreadIDAddr](
+ CodeGenFunction &CGF, PrePostActionTy &Action) {
+ Action.Enter(CGF);
+
+ llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+ OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
+ OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+ OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+ emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
+ };
+ auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
+ PrePostActionTy &) {
+
+ RegionCodeGenTy RCG(CodeGen);
+ llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+ llvm::Value *ThreadID = getThreadID(CGF, Loc);
+ llvm::Value *Args[] = {RTLoc, ThreadID};
+ NVPTXActionTy Action(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
+ Args,
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
+ Args);
+ RCG.setAction(Action);
+ RCG(CGF);
+ };
+
+ auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
+ PrePostActionTy &Action) {
+ CGBuilderTy &Bld = CGF.Builder;
+ llvm::Function *WFn = WrapperFunctionsMap[Fn];
+ assert(WFn && "Wrapper function does not exist!");
llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
// Prepare for parallel region. Indicate the outlined function.
llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
- CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
- OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
- Args);
+ CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
+ Args);
// Create a private scope that will globalize the arguments
// passed from the outside of the target region.
@@ -1496,13 +1715,13 @@ void CGOpenMPRuntimeNVPTX::emitGenericPa
// Store variable address in a list of references to pass to workers.
unsigned Idx = 0;
ASTContext &Ctx = CGF.getContext();
- Address SharedArgListAddress = CGF.EmitLoadOfPointer(SharedArgs,
- Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
- .castAs<PointerType>());
+ Address SharedArgListAddress = CGF.EmitLoadOfPointer(
+ SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
+ .castAs<PointerType>());
for (llvm::Value *V : CapturedVars) {
- Address Dst = Bld.CreateConstInBoundsGEP(
- SharedArgListAddress, Idx, CGF.getPointerSize());
- llvm::Value * PtrV;
+ Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+ CGF.getPointerSize());
+ llvm::Value *PtrV;
if (V->getType()->isIntegerTy())
PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
else
@@ -1533,43 +1752,67 @@ void CGOpenMPRuntimeNVPTX::emitGenericPa
Work.emplace_back(WFn);
};
- llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
- llvm::Value *ThreadID = getThreadID(CGF, Loc);
- llvm::Value *Args[] = {RTLoc, ThreadID};
-
- auto &&SeqGen = [this, Fn, CapturedVars, &Args, Loc](CodeGenFunction &CGF,
- PrePostActionTy &) {
- auto &&CodeGen = [this, Fn, CapturedVars, Loc](CodeGenFunction &CGF,
- PrePostActionTy &Action) {
- Action.Enter(CGF);
-
- llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
- Address ZeroAddr =
- CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
- /*DestWidth=*/32, /*Signed=*/1),
- ".zero.addr");
- CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
- OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
- OutlinedFnArgs.push_back(ZeroAddr.getPointer());
- OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
- emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
- };
-
+ auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen, &CodeGen](
+ CodeGenFunction &CGF, PrePostActionTy &Action) {
RegionCodeGenTy RCG(CodeGen);
- NVPTXActionTy Action(
- createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
- Args,
- createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
- Args);
- RCG.setAction(Action);
- RCG(CGF);
+ if (IsInParallelRegion) {
+ SeqGen(CGF, Action);
+ } else if (IsInTargetMasterThreadRegion) {
+ L0ParallelGen(CGF, Action);
+ } else {
+ // Check for master and then parallelism:
+ // if (is_master) {
+ // Worker call.
+ // } else if (__kmpc_parallel_level(loc, gtid)) {
+ // Serialized execution.
+ // } else {
+ // Outlined function call.
+ // }
+ CGBuilderTy &Bld = CGF.Builder;
+ llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
+ if (!isInSpmdExecutionMode()) {
+ llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
+ llvm::BasicBlock *ParallelCheckBB =
+ CGF.createBasicBlock(".parallelcheck");
+ llvm::Value *IsMaster =
+ Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
+ Bld.CreateCondBr(IsMaster, MasterCheckBB, ParallelCheckBB);
+ CGF.EmitBlock(MasterCheckBB);
+ L0ParallelGen(CGF, Action);
+ CGF.EmitBranch(ExitBB);
+ // There is no need to emit line number for unconditional branch.
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ CGF.EmitBlock(ParallelCheckBB);
+ }
+ llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+ llvm::Value *ThreadID = getThreadID(CGF, Loc);
+ llvm::Value *PL = CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
+ {RTLoc, ThreadID});
+ llvm::Value *Res = Bld.CreateIsNotNull(PL);
+ llvm::BasicBlock *ThenBlock = CGF.createBasicBlock("omp_if.then");
+ llvm::BasicBlock *ElseBlock = CGF.createBasicBlock("omp_if.else");
+ Bld.CreateCondBr(Res, ThenBlock, ElseBlock);
+ // Emit the 'then' code.
+ CGF.EmitBlock(ThenBlock);
+ SeqGen(CGF, Action);
+ // There is no need to emit line number for unconditional branch.
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ // Emit the 'else' code.
+ CGF.EmitBlock(ElseBlock);
+ RCG(CGF);
+ // There is no need to emit line number for unconditional branch.
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ // Emit the continuation block for code after the if.
+ CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
+ }
};
if (IfCond) {
- emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
+ emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
} else {
CodeGenFunction::RunCleanupsScope Scope(CGF);
- RegionCodeGenTy ThenRCG(L0ParallelGen);
+ RegionCodeGenTy ThenRCG(LNParallelGen);
ThenRCG(CGF);
}
}
@@ -3090,6 +3333,9 @@ llvm::Function *CGOpenMPRuntimeNVPTX::cr
void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
const Decl *D) {
+ if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+ return;
+
assert(D && "Expected function or captured|block decl.");
assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
"Function is registered already.");
@@ -3143,6 +3389,9 @@ void CGOpenMPRuntimeNVPTX::emitFunctionP
Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
const VarDecl *VD) {
+ if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+ return Address::invalid();
+
VD = VD->getCanonicalDecl();
auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I == FunctionGlobalizedDecls.end())
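
A sketch of the branching that the new LNParallelGen emits, written as a
standalone C++ mock (mine, not from the commit). The __kmpc_* declarations
mirror the runtime entry points referenced in the diff; everything else
(is_master_thread, run_l0_parallel, outlined_fn) is invented for illustration:

// Sketch only: mirrors the "master / parallel level / direct call" comment
// in emitNonSPMDParallelCall above.
#include <cstdint>

extern "C" uint16_t __kmpc_parallel_level(void *loc, int32_t gtid);
extern "C" void __kmpc_serialized_parallel(void *loc, int32_t gtid);
extern "C" void __kmpc_end_serialized_parallel(void *loc, int32_t gtid);
bool is_master_thread();        // illustrative stub
void run_l0_parallel();         // stands in for the L0ParallelGen path
void outlined_fn(int32_t *gtid, int32_t *bound_tid);

void parallelCallSketch(void *loc, int32_t gtid) {
  int32_t zero = 0;
  if (is_master_thread()) {
    // Master thread: L0 parallelism; hand the wrapper to the workers via
    // __kmpc_kernel_prepare_parallel / __kmpc_begin_sharing_variables.
    run_l0_parallel();
  } else if (__kmpc_parallel_level(loc, gtid) != 0) {
    // Already inside a parallel region: L2+ parallelism, serialized.
    __kmpc_serialized_parallel(loc, gtid);
    outlined_fn(&gtid, &zero);
    __kmpc_end_serialized_parallel(loc, gtid);
  } else {
    // Level-0 thread outside the master: call the outlined body directly.
    outlined_fn(&gtid, &zero);
  }
}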
Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h?rev=331642&r1=331641&r2=331642&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h (original)
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h Mon May 7 07:50:05 2018
@@ -25,7 +25,7 @@ namespace CodeGen {
class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
private:
- // Parallel outlined function work for workers to execute.
+ /// Parallel outlined function work for workers to execute.
llvm::SmallVector<llvm::Function *, 16> Work;
struct EntryFunctionState {
@@ -52,14 +52,14 @@ private:
/// \brief Helper for worker function. Emit body of worker loop.
void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST);
- /// \brief Helper for generic target entry function. Guide the master and
+ /// \brief Helper for non-SPMD target entry function. Guide the master and
/// worker threads to their respective locations.
- void emitGenericEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
+ void emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
WorkerFunctionState &WST);
- /// \brief Signal termination of OMP execution for generic target entry
+ /// \brief Signal termination of OMP execution for non-SPMD target entry
/// function.
- void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+ void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
/// Helper for generic variables globalization prolog.
void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc);
@@ -93,7 +93,7 @@ private:
/// \param IsOffloadEntry True if the outlined function is an offload entry.
/// An outlined function may not be an entry if, e.g. the if clause always
/// evaluates to false.
- void emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName,
+ void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn,
llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen);
@@ -133,14 +133,14 @@ private:
/// \brief Emits code for parallel or serial call of the \a OutlinedFn with
/// variables captured in a record which address is stored in \a
/// CapturedStruct.
- /// This call is for the Generic Execution Mode.
+ /// This call is for the Non-SPMD Execution Mode.
/// \param OutlinedFn Outlined function to be run in parallel threads. Type of
/// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
/// \param CapturedVars A pointer to the record with the references to
/// variables used in \a OutlinedFn function.
/// \param IfCond Condition in the associated 'if' clause, if it was
/// specified, nullptr otherwise.
- void emitGenericParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+ void emitNonSPMDParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
const Expr *IfCond);
@@ -304,15 +304,15 @@ public:
Address getAddressOfLocalVariable(CodeGenFunction &CGF,
const VarDecl *VD) override;
- /// Target codegen is specialized based on two programming models: the
- /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd'
- /// model for constructs like 'target parallel' that support it.
- enum ExecutionMode {
- /// Single Program Multiple Data.
- Spmd,
- /// Generic codegen to support fork-join model.
+ /// Target codegen is specialized based on two data-sharing modes: CUDA, in
+ /// which the local variables are actually global threadlocal, and Generic, in
+ /// which the local variables are placed in global memory if they may escape
+ /// their declaration context.
+ enum DataSharingMode {
+ /// CUDA data sharing mode.
+ CUDA,
+ /// Generic data-sharing mode.
Generic,
- Unknown,
};
/// Cleans up references to the objects in finished function.
@@ -320,11 +320,17 @@ public:
void functionFinished(CodeGenFunction &CGF) override;
private:
- // Track the execution mode when codegening directives within a target
- // region. The appropriate mode (generic/spmd) is set on entry to the
- // target region and used by containing directives such as 'parallel'
- // to emit optimized code.
- ExecutionMode CurrentExecutionMode;
+ /// Track the execution mode when codegening directives within a target
+ /// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the
+ /// target region and used by containing directives such as 'parallel'
+ /// to emit optimized code.
+ bool IsInSPMDExecutionMode = false;
+
+ /// true if we're emitting the code for the target region and next parallel
+ /// region is L0 for sure.
+ bool IsInTargetMasterThreadRegion = false;
+ /// true if we're definitely in the parallel region.
+ bool IsInParallelRegion = false;
/// Map between an outlined function and its wrapper.
llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
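
For context, a minimal example (illustrative, not from the commit) of a local
variable that "may escape its declaration context" and is therefore globalized
under the Generic data-sharing mode; compiling with -fopenmp-cuda-mode selects
the CUDA mode instead, and the new early returns skip the globalization
prolog/epilog entirely:

// Illustrative only: 'x' is referenced from the inner parallel region, so
// in Generic mode it is moved to globalized storage via
// __kmpc_data_sharing_push_stack/pop_stack (see the baz() checks in the
// test below); in CUDA mode it stays a plain local.
void escape_example() {
#pragma omp target
  {
    int x = 0;           // declared in the target region
#pragma omp parallel     // workers reference x, so x escapes
#pragma omp atomic
    x += 1;
  }
}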
Modified: cfe/trunk/lib/CodeGen/CodeGenModule.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenModule.cpp?rev=331642&r1=331641&r2=331642&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CodeGenModule.cpp (original)
+++ cfe/trunk/lib/CodeGen/CodeGenModule.cpp Mon May 7 07:50:05 2018
@@ -2399,8 +2399,17 @@ llvm::Constant *CodeGenModule::GetOrCrea
// For the device mark the function as one that should be emitted.
if (getLangOpts().OpenMPIsDevice && OpenMPRuntime &&
!OpenMPRuntime->markAsGlobalTarget(GD) && FD->isDefined() &&
- !DontDefer && !IsForDefinition)
- addDeferredDeclToEmit(GD);
+ !DontDefer && !IsForDefinition) {
+ const FunctionDecl *FDDef = FD->getDefinition();
+ GlobalDecl GDDef;
+ if (const auto *CD = dyn_cast<CXXConstructorDecl>(FDDef))
+ GDDef = GlobalDecl(CD, GD.getCtorType());
+ else if (const auto *DD = dyn_cast<CXXDestructorDecl>(FDDef))
+ GDDef = GlobalDecl(DD, GD.getDtorType());
+ else
+ GDDef = GlobalDecl(FDDef);
+ addDeferredDeclToEmit(GDDef);
+ }
if (FD->isMultiVersion() && FD->getAttr<TargetAttr>()->isDefaultVersion()) {
UpdateMultiVersionNames(GD, FD);
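
For illustration (my reading of the motivating case, which the log does not
spell out): when a device reference resolves to a declaration that is not the
definition, the deferred entry must be built from the definition, and for
constructors/destructors the GlobalDecl must also carry the right ctor/dtor
kind (GD.getCtorType()/getDtorType()):

// Illustrative only: S::S() is referenced through its in-class declaration,
// but the decl that has to be deferred for device emission is the separate
// definition (FD->getDefinition()), which here is a CXXConstructorDecl.
struct S {
  S();                   // referenced declaration
  int v;
};
S::S() : v(0) {}         // definition that actually gets emitted

void use_on_device() {
#pragma omp target
  {
    S s;                 // device reference to S::S()
  }
}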
Modified: cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp?rev=331642&r1=331641&r2=331642&view=diff
==============================================================================
--- cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp (original)
+++ cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp Mon May 7 07:50:05 2018
@@ -9,15 +9,17 @@
#define HEADER
// Check that the execution mode of all 6 target regions is set to Generic Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l100}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l177}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l287}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l324}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l342}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l307}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l102}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l179}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l289}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l326}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l344}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l309}}_exec_mode = weak constant i8 1
__thread int id;
+int baz(int f);
+
template<typename tx, typename ty>
struct TT{
tx X;
@@ -33,7 +35,7 @@ int foo(int n) {
double cn[5][n];
TT<long long, char> d;
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l100}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l102}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -64,7 +66,7 @@ int foo(int n) {
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l100]]()
+ // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l102]]()
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
@@ -106,7 +108,7 @@ int foo(int n) {
{
}
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l177}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l179}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -137,7 +139,7 @@ int foo(int n) {
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l177]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]], i[[SZ:32|64]] [[ID:%[a-zA-Z_]+]])
+ // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l179]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]], i[[SZ:32|64]] [[ID:%[a-zA-Z_]+]])
// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]],
// CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]],
// CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
@@ -180,7 +182,7 @@ int foo(int n) {
id = aa;
}
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l287}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l289}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -211,7 +213,7 @@ int foo(int n) {
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l287]](i[[SZ]]
+ // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l289]](i[[SZ]]
// Create local storage for each capture.
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]]
// CHECK: [[LOCAL_B:%.+]] = alloca [10 x float]*
@@ -343,6 +345,7 @@ struct S1 {
{
this->a = (double)b + 1.5;
c[1][1] = ++a;
+ baz(a);
}
return c[1][1] + (int)b;
@@ -364,7 +367,13 @@ int bar(int n){
return a;
}
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+324}}_worker()
+int baz(int f) {
+#pragma omp parallel
+ f = 2;
+ return f;
+}
+
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+326}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -395,7 +404,7 @@ int bar(int n){
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l324]](i[[SZ]]
+ // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l326]](i[[SZ]]
// Create local storage for each capture.
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]]
// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]]
@@ -450,9 +459,10 @@ int bar(int n){
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l342}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l344}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+ // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t*
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
@@ -469,6 +479,8 @@ int bar(int n){
// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
//
// CHECK: [[EXEC_PARALLEL]]
+ // CHECK: [[WORK_FN:%.+]] = bitcast i8* [[WORK]] to void (i16, i32)*
+ // CHECK: call void [[WORK_FN]](i16 0, i32 [[GTID]])
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[TERM_PARALLEL]]
@@ -481,7 +493,7 @@ int bar(int n){
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l342]](
+ // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l344]](
// Create local storage for each capture.
// CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]*
// CHECK: [[LOCAL_B:%.+]] = alloca i[[SZ]]
@@ -528,6 +540,7 @@ int bar(int n){
// CHECK-64-DAG:load i32, i32* [[REF_B]]
// CHECK-32-DAG:load i32, i32* [[LOCAL_B]]
// CHECK-DAG: getelementptr inbounds i16, i16* [[REF_C]], i[[SZ]] %{{.+}}
+ // CHECK: call i32 [[BAZ:@.*baz.*]](i32 %
// CHECK: br label {{%?}}[[TERMINATE:.+]]
//
// CHECK: [[TERMINATE]]
@@ -538,9 +551,48 @@ int bar(int n){
// CHECK: [[EXIT]]
// CHECK: ret void
+ // CHECK: define i32 [[BAZ]](i32 [[F:%.*]])
+ // CHECK: [[ZERO_ADDR:%.+]] = alloca i32,
+ // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t*
+ // CHECK: [[GTID_ADDR:%.+]] = alloca i32,
+ // CHECK: store i32 0, i32* [[ZERO_ADDR]]
+ // CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 4, i16 0)
+ // CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty*
+ // CHECK: [[F_PTR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[REC_ADDR]], i32 0, i32 0
+ // CHECK: store i32 [[F]], i32* [[F_PTR]],
+ // CHECK: store i32 [[GTID]], i32* [[GTID_ADDR]],
+ // CHECK: icmp eq i32
+ // CHECK: br i1
+
+ // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*), i16 1)
+ // CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_PTR:%.+]], i{{64|32}} 1)
+ // CHECK: [[SHARED:%.+]] = load i8**, i8*** [[SHARED_PTR]],
+ // CHECK: [[REF:%.+]] = getelementptr inbounds i8*, i8** [[SHARED]], i{{64|32}} 0
+ // CHECK: [[F_REF:%.+]] = bitcast i32* [[F_PTR]] to i8*
+ // CHECK: store i8* [[F_REF]], i8** [[REF]],
+ // CHECK: call void @llvm.nvvm.barrier0()
+ // CHECK: call void @llvm.nvvm.barrier0()
+ // CHECK: call void @__kmpc_end_sharing_variables()
+ // CHECK: br label
+
+ // CHECK: [[RES:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @{{.+}}, i32 [[GTID]])
+ // CHECK: icmp ne i16 [[RES]], 0
+ // CHECK: br i1
+
+ // CHECK: call void @__kmpc_serialized_parallel(%struct.ident_t* @{{.+}}, i32 [[GTID]])
+ // CHECK: call void @__omp_outlined__(i32* [[GTID_ADDR]], i32* [[ZERO_ADDR]], i32* [[F_PTR]])
+ // CHECK: call void @__kmpc_end_serialized_parallel(%struct.ident_t* @{{.+}}, i32 [[GTID]])
+ // CHECK: br label
+
+ // CHECK: call void @__omp_outlined__(i32* [[GTID_ADDR]], i32* [[ZERO_ADDR]], i32* [[F_PTR]])
+ // CHECK: br label
+
+ // CHECK: [[RES:%.+]] = load i32, i32* [[F_PTR]],
+ // CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[PTR]])
+ // CHECK: ret i32 [[RES]]
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l307}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l309}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -571,7 +623,7 @@ int bar(int n){
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l307]](i[[SZ]]
+ // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l309]](i[[SZ]]
// Create local storage for each capture.
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]]
// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]]
Modified: cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp?rev=331642&r1=331641&r2=331642&view=diff
==============================================================================
--- cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp (original)
+++ cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp Mon May 7 07:50:05 2018
@@ -19,45 +19,20 @@ int main(int argc, char **argv) {
return 0;
}
-// CHECK: define internal void @__omp_offloading_{{.*}}_main_l[[@LINE-6]]_worker()
-// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @
-// CHECK: call void [[PARALLEL:@.+]]_wrapper(i16 0, i32 [[TID]])
+// CHECK: @__omp_offloading_{{.*}}_main_l16_exec_mode = weak constant i8 0
-// CHECK: define void @__omp_offloading_{{.*}}_main_l[[@LINE-10]](i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}})
+// CHECK: define void @__omp_offloading_{{.*}}_main_l16(i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}})
// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @
-// CHECK: call void @__kmpc_kernel_init(
-// CHECK: call void @__kmpc_data_sharing_init_stack()
+// CHECK: call void @__kmpc_spmd_kernel_init(
// CHECK: call void @__kmpc_for_static_init_4(
-// CHECK: call void @__kmpc_kernel_prepare_parallel(
-// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[BUF_PTR_PTR:%[^,]+]], i{{64|32}} 4)
-// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]],
-// CHECK: [[LB:%.+]] = inttoptr i{{64|32}} [[LB_:%.*]] to i8*
-// CHECK: store i8* [[LB]], i8** [[BUF_PTR]],
-// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1
-// CHECK: [[UB:%.+]] = inttoptr i{{64|32}} [[UB_:%.*]] to i8*
-// CHECK: store i8* [[UB]], i8** [[BUF_PTR1]],
-// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2
-// CHECK: [[ARGC:%.+]] = inttoptr i{{64|32}} [[ARGC_:%.*]] to i8*
-// CHECK: store i8* [[ARGC]], i8** [[BUF_PTR2]],
-// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3
-// CHECK: [[A_PTR:%.+]] = bitcast i32* [[A_ADDR:%.*]] to i8*
-// CHECK: store i8* [[A_PTR]], i8** [[BUF_PTR3]],
-// CHECK: call void @llvm.nvvm.barrier0()
-// CHECK: call void @llvm.nvvm.barrier0()
-// CHECK: call void @__kmpc_end_sharing_variables()
-// CHECK: br label
-
-// CHECK: call void @__kmpc_serialized_parallel(%struct.ident_t* @
-// CHECK: [[GTID_ADDR:%.*]] = load i32*, i32** %
-// CHECK: call void [[PARALLEL]](i32* [[GTID_ADDR]], i32* %{{.+}}, i{{64|32}} [[LB_]], i{{64|32}} [[UB_]], i{{64|32}} [[ARGC_]], i32* [[A_ADDR]])
-// CHECK: call void @__kmpc_end_serialized_parallel(%struct.ident_t* @
+
+// CHECK: call void [[PARALLEL:@.+]](i32* %{{.*}}, i32* %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.*}}, i{{64|32}} %{{.*}}, i32* %{{.*}})
// CHECK: br label %
// CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @
-// CHECK: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i32* dereferenceable{{.*}})
// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 8, i16 0)
@@ -75,24 +50,4 @@ int main(int argc, char **argv) {
// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[RES]])
-// define internal void [[PARALLEL]]_wrapper(i16 zeroext, i32)
-// CHECK: call void @__kmpc_get_shared_variables(i8*** [[BUF_PTR_PTR:%.+]])
-// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]],
-// CHECK: [[BUF_PTR0:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 0
-// CHECK: [[LB_PTR:%.+]] = bitcast i8** [[BUF_PTR0]] to i{{64|32}}*
-// CHECK: [[LB:%.+]] = load i{{64|32}}, i{{64|32}}* [[LB_PTR]],
-// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1
-// CHECK: [[UB_PTR:%.+]] = bitcast i8** [[BUF_PTR1]] to i{{64|32}}*
-// CHECK: [[UB:%.+]] = load i{{64|32}}, i{{64|32}}* [[UB_PTR]],
-// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2
-// CHECK: [[ARGC_ADDR:%.+]] = bitcast i8** [[BUF_PTR2]] to i32*
-// CHECK: [[ARGC:%.+]] = load i32, i32* [[ARGC_ADDR]],
-// CHECK-64: [[ARGC_CAST:%.+]] = zext i32 [[ARGC]] to i64
-// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3
-// CHECK: [[A_ADDR_REF:%.+]] = bitcast i8** [[BUF_PTR3]] to i32**
-// CHECK: [[A_ADDR:%.+]] = load i32*, i32** [[A_ADDR_REF]],
-// CHECK-64: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i64 [[LB]], i64 [[UB]], i64 [[ARGC_CAST]], i32* [[A_ADDR]])
-// CHECK-32: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i32 [[LB]], i32 [[UB]], i32 [[ARGC]], i32* [[A_ADDR]])
-// CHECK: ret void
-
#endif
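
For reference, a sketch of the directive shape this test exercises (mine; the
test's actual source around line 16 is not shown in the diff): combined
directives like the following are now classified as SPMD by
supportsSPMDExecutionMode, which is why the checks above expect
__kmpc_spmd_kernel_init/deinit and an _exec_mode global of 0:

// Sketch only: one combined directive that now takes the SPMD path.
void spmd_example(int n, int *a) {
#pragma omp target teams distribute parallel for map(tofrom : a[:n])
  for (int i = 0; i < n; ++i)
    a[i] += i;
}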