[clang] 1d5711c - [OpenMP] Unified entry point for SPMD & generic kernels in the device RTL

Sat Jul 10 10:33:36 PDT 2021

Author: Johannes Doerfert
Date: 2021-07-10T12:32:50-05:00
New Revision: 1d5711c3eeb62098b46d4d383f2e849b9756105d

URL: https://github.com/llvm/llvm-project/commit/1d5711c3eeb62098b46d4d383f2e849b9756105d
DIFF: https://github.com/llvm/llvm-project/commit/1d5711c3eeb62098b46d4d383f2e849b9756105d.diff

LOG: [OpenMP] Unified entry point for SPMD & generic kernels in the device RTL

In the spirit of TRegions [0], this patch provides a simpler and uniform
interface for a kernel to set up the device runtime. The OMPIRBuilder is
used for reuse in Flang. A custom state machine will be generated in the
follow up patch.

The "surplus" threads of the "master warp" will not exit early anymore
so we need to use non-aligned barriers. The new runtime will not have an
extra warp but also require these non-aligned barriers.

[0] https://link.springer.com/chapter/10.1007/978-3-030-28596-8_11

This was in parts extracted from D59319.

Reviewed By: ABataev, JonChesterfield

Differential Revision: https://reviews.llvm.org/D101976

Added: 
    openmp/libomptarget/deviceRTLs/common/include/target.h

Modified: 
    clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
    clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
    llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
    llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
    llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/OpenMP/replace_globalization.ll
    llvm/test/Transforms/OpenMP/single_threaded_execution.ll
    openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
    openmp/libomptarget/deviceRTLs/common/src/parallel.cu
    openmp/libomptarget/deviceRTLs/interface.h
    openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 965b3f1534d67..1cb367ec71885 100644

--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -553,63 +553,6 @@ static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
                        "nvptx_lane_id");
 }
 
-/// Get the value of the thread_limit clause in the teams directive.
-/// For the 'generic' execution mode, the runtime encodes thread_limit in
-/// the launch parameters, always starting thread_limit+warpSize threads per
-/// CTA. The threads in the last warp are reserved for master execution.
-/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
-static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
-                                   bool IsInSPMDExecutionMode = false) {
-  CGBuilderTy &Bld = CGF.Builder;
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
-  llvm::Value *ThreadLimit = nullptr;
-  if (IsInSPMDExecutionMode)
-    ThreadLimit = RT.getGPUNumThreads(CGF);
-  else {
-    llvm::Value *GPUNumThreads = RT.getGPUNumThreads(CGF);
-    llvm::Value *GPUWarpSize = RT.getGPUWarpSize(CGF);
-    ThreadLimit = Bld.CreateNUWSub(GPUNumThreads, GPUWarpSize, "thread_limit");
-  }
-  assert(ThreadLimit != nullptr && "Expected non-null ThreadLimit");
-  return ThreadLimit;
-}
-
-/// Get the thread id of the OMP master thread.
-/// The master thread id is the first thread (lane) of the last warp in the
-/// GPU block.  Warp size is assumed to be some power of 2.
-/// Thread id is 0 indexed.
-/// E.g: If NumThreads is 33, master id is 32.
-///      If NumThreads is 64, master id is 32.
-///      If NumThreads is 1024, master id is 992.
-static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
-  CGBuilderTy &Bld = CGF.Builder;
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
-  llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
-  // We assume that the warp size is a power of 2.
-  llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));
-
-  llvm::Value *NumThreadsSubOne = Bld.CreateNUWSub(NumThreads, Bld.getInt32(1));
-  return Bld.CreateAnd(NumThreadsSubOne, Bld.CreateNot(Mask), "master_tid");
-}
-
-CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(
-    CodeGenModule &CGM, SourceLocation Loc)
-    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
-      Loc(Loc) {
-  createWorkerFunction(CGM);
-}
-
-void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction(
-    CodeGenModule &CGM) {
-  // Create an worker function with no arguments.
-
-  WorkerFn = llvm::Function::Create(
-      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
-      /*placeholder=*/"_worker", &CGM.getModule());
-  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
-  WorkerFn->setDoesNotRecurse();
-}
-
 CGOpenMPRuntimeGPU::ExecutionMode
 CGOpenMPRuntimeGPU::getExecutionMode() const {
   return CurrentExecutionMode;
@@ -1073,23 +1016,19 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                              const RegionCodeGenTy &CodeGen) {
   ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
   EntryFunctionState EST;
-  WorkerFunctionState WST(CGM, D.getBeginLoc());
-  Work.clear();
   WrapperFunctionsMap.clear();
 
   // Emit target region as a standalone region.
   class NVPTXPrePostActionTy : public PrePostActionTy {
     CGOpenMPRuntimeGPU::EntryFunctionState &EST;
-    CGOpenMPRuntimeGPU::WorkerFunctionState &WST;
 
   public:
-    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
-                         CGOpenMPRuntimeGPU::WorkerFunctionState &WST)
-        : EST(EST), WST(WST) {}
+    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
+        : EST(EST) {}
     void Enter(CodeGenFunction &CGF) override {
       auto &RT =
           static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
-      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
+      RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
       // Skip target region initialization.
       RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
     }
@@ -1097,93 +1036,33 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
       auto &RT =
           static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
       RT.clearLocThreadIdInsertPt(CGF);
-      RT.emitNonSPMDEntryFooter(CGF, EST);
+      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
     }
-  } Action(EST, WST);
+  } Action(EST);
   CodeGen.setAction(Action);
   IsInTTDRegion = true;
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
   IsInTTDRegion = false;
-
-  // Now change the name of the worker function to correspond to this target
-  // region's entry function.
-  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
-
-  // Create the worker function
-  emitWorkerFunction(WST);
 }
 
-// Setup NVPTX threads for master-worker OpenMP scheme.
-void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
-                                                  EntryFunctionState &EST,
-                                                  WorkerFunctionState &WST) {
+void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
+                                        EntryFunctionState &EST, bool IsSPMD) {
   CGBuilderTy &Bld = CGF.Builder;
-
-  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
-  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
-  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
-  EST.ExitBB = CGF.createBasicBlock(".exit");
-
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
-  llvm::Value *GPUThreadID = RT.getGPUThreadID(CGF);
-  llvm::Value *ThreadLimit = getThreadLimit(CGF);
-  llvm::Value *IsWorker = Bld.CreateICmpULT(GPUThreadID, ThreadLimit);
-  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
-
-  CGF.EmitBlock(WorkerBB);
-  emitCall(CGF, WST.Loc, WST.WorkerFn);
-  CGF.EmitBranch(EST.ExitBB);
-
-  CGF.EmitBlock(MasterCheckBB);
-  GPUThreadID = RT.getGPUThreadID(CGF);
-  llvm::Value *MasterThreadID = getMasterThreadID(CGF);
-  llvm::Value *IsMaster = Bld.CreateICmpEQ(GPUThreadID, MasterThreadID);
-  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
-
-  CGF.EmitBlock(MasterBB);
-  IsInTargetMasterThreadRegion = true;
-  // SEQUENTIAL (MASTER) REGION START
-  // First action in sequential region:
-  // Initialize the state of the OpenMP runtime library on the GPU.
-  // TODO: Optimize runtime initialization and pass in correct value.
-  llvm::Value *Args[] = {getThreadLimit(CGF),
-                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
-  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                          CGM.getModule(), OMPRTL___kmpc_kernel_init),
-                      Args);
-
-  emitGenericVarsProlog(CGF, WST.Loc);
+  Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime()));
+  IsInTargetMasterThreadRegion = IsSPMD;
+  if (!IsSPMD)
+    emitGenericVarsProlog(CGF, EST.Loc);
 }
 
-void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
-                                                  EntryFunctionState &EST) {
-  IsInTargetMasterThreadRegion = false;
-  if (!CGF.HaveInsertPoint())
-    return;
-
-  emitGenericVarsEpilog(CGF);
-
-  if (!EST.ExitBB)
-    EST.ExitBB = CGF.createBasicBlock(".exit");
-
-  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
-  CGF.EmitBranch(TerminateBB);
+void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
+                                          EntryFunctionState &EST,
+                                          bool IsSPMD) {
+  if (!IsSPMD)
+    emitGenericVarsEpilog(CGF);
 
-  CGF.EmitBlock(TerminateBB);
-  // Signal termination condition.
-  // TODO: Optimize runtime initialization and pass in correct value.
-  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
-  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                          CGM.getModule(), OMPRTL___kmpc_kernel_deinit),
-                      Args);
-  // Barrier to terminate worker threads.
-  syncCTAThreads(CGF);
-  // Master thread jumps to exit point.
-  CGF.EmitBranch(EST.ExitBB);
-
-  CGF.EmitBlock(EST.ExitBB);
-  EST.ExitBB = nullptr;
+  CGBuilderTy &Bld = CGF.Builder;
+  OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime());
 }
 
 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@@ -1202,23 +1081,21 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
   class NVPTXPrePostActionTy : public PrePostActionTy {
     CGOpenMPRuntimeGPU &RT;
     CGOpenMPRuntimeGPU::EntryFunctionState &EST;
-    const OMPExecutableDirective &D;
 
   public:
     NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
-                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,
-                         const OMPExecutableDirective &D)
-        : RT(RT), EST(EST), D(D) {}
+                         CGOpenMPRuntimeGPU::EntryFunctionState &EST)
+        : RT(RT), EST(EST) {}
     void Enter(CodeGenFunction &CGF) override {
-      RT.emitSPMDEntryHeader(CGF, EST, D);
+      RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
       // Skip target region initialization.
       RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
     }
     void Exit(CodeGenFunction &CGF) override {
       RT.clearLocThreadIdInsertPt(CGF);
-      RT.emitSPMDEntryFooter(CGF, EST);
+      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
     }
-  } Action(*this, EST, D);
+  } Action(*this, EST);
   CodeGen.setAction(Action);
   IsInTTDRegion = true;
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
@@ -1226,54 +1103,6 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
   IsInTTDRegion = false;
 }
 
-void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
-    CodeGenFunction &CGF, EntryFunctionState &EST,
-    const OMPExecutableDirective &D) {
-  CGBuilderTy &Bld = CGF.Builder;
-
-  // Setup BBs in entry function.
-  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
-  EST.ExitBB = CGF.createBasicBlock(".exit");
-
-  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
-                         /*RequiresOMPRuntime=*/
-                         Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
-  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
-                      Args);
-
-  CGF.EmitBranch(ExecuteBB);
-
-  CGF.EmitBlock(ExecuteBB);
-
-  IsInTargetMasterThreadRegion = true;
-}
-
-void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF,
-                                               EntryFunctionState &EST) {
-  IsInTargetMasterThreadRegion = false;
-  if (!CGF.HaveInsertPoint())
-    return;
-
-  if (!EST.ExitBB)
-    EST.ExitBB = CGF.createBasicBlock(".exit");
-
-  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
-  CGF.EmitBranch(OMPDeInitBB);
-
-  CGF.EmitBlock(OMPDeInitBB);
-  // DeInitialize the OMP state in the runtime; called by all active threads.
-  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
-                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
-  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_deinit_v2),
-                      Args);
-  CGF.EmitBranch(EST.ExitBB);
-
-  CGF.EmitBlock(EST.ExitBB);
-  EST.ExitBB = nullptr;
-}
-
 // Create a unique global variable to indicate the execution mode of this target
 // region. The execution mode is either 'generic', or 'spmd' depending on the
 // target directive. This variable is picked up by the offload library to setup
@@ -1290,137 +1119,6 @@ static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
   CGM.addCompilerUsedGlobal(GVMode);
 }
 
-void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) {
-  ASTContext &Ctx = CGM.getContext();
-
-  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
-  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
-                    WST.Loc, WST.Loc);
-  emitWorkerLoop(CGF, WST);
-  CGF.FinishFunction();
-}
-
-void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF,
-                                        WorkerFunctionState &WST) {
-  //
-  // The workers enter this loop and wait for parallel work from the master.
-  // When the master encounters a parallel region it sets up the work + variable
-  // arguments, and wakes up the workers.  The workers first check to see if
-  // they are required for the parallel region, i.e., within the # of requested
-  // parallel threads.  The activated workers load the variable arguments and
-  // execute the parallel work.
-  //
-
-  CGBuilderTy &Bld = CGF.Builder;
-
-  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
-  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
-  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
-  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
-  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
-  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
-
-  CGF.EmitBranch(AwaitBB);
-
-  // Workers wait for work from master.
-  CGF.EmitBlock(AwaitBB);
-  // Wait for parallel work
-  syncCTAThreads(CGF);
-
-  Address WorkFn =
-      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
-  Address ExecStatus =
-      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
-  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
-  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
-
-  // TODO: Optimize runtime initialization and pass in correct value.
-  llvm::Value *Args[] = {WorkFn.getPointer()};
-  llvm::Value *Ret =
-      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                              CGM.getModule(), OMPRTL___kmpc_kernel_parallel),
-                          Args);
-  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
-
-  // On termination condition (workid == 0), exit loop.
-  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
-  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
-  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
-
-  // Activate requested workers.
-  CGF.EmitBlock(SelectWorkersBB);
-  llvm::Value *IsActive =
-      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
-  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
-
-  // Signal start of parallel region.
-  CGF.EmitBlock(ExecuteBB);
-  // Skip initialization.
-  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
-
-  // Process work items: outlined parallel functions.
-  for (llvm::Function *W : Work) {
-    // Try to match this outlined function.
-    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
-
-    llvm::Value *WorkFnMatch =
-        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
-
-    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
-    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
-    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
-
-    // Execute this outlined function.
-    CGF.EmitBlock(ExecuteFNBB);
-
-    // Insert call to work function via shared wrapper. The shared
-    // wrapper takes two arguments:
-    //   - the parallelism level;
-    //   - the thread ID;
-    emitCall(CGF, WST.Loc, W,
-             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
-
-    // Go to end of parallel region.
-    CGF.EmitBranch(TerminateBB);
-
-    CGF.EmitBlock(CheckNextBB);
-  }
-  // Default case: call to outlined function through pointer if the target
-  // region makes a declare target call that may contain an orphaned parallel
-  // directive.
-  auto *ParallelFnTy =
-      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
-                              /*isVarArg=*/false);
-  llvm::Value *WorkFnCast =
-      Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
-  // Insert call to work function via shared wrapper. The shared
-  // wrapper takes two arguments:
-  //   - the parallelism level;
-  //   - the thread ID;
-  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
-           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
-  // Go to end of parallel region.
-  CGF.EmitBranch(TerminateBB);
-
-  // Signal end of parallel region.
-  CGF.EmitBlock(TerminateBB);
-  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                          CGM.getModule(), OMPRTL___kmpc_kernel_end_parallel),
-                      llvm::None);
-  CGF.EmitBranch(BarrierBB);
-
-  // All active and inactive workers wait at a barrier after parallel region.
-  CGF.EmitBlock(BarrierBB);
-  // Barrier after parallel region.
-  syncCTAThreads(CGF);
-  CGF.EmitBranch(AwaitBB);
-
-  // Exit target region.
-  CGF.EmitBlock(ExitBB);
-  // Skip initialization.
-  clearLocThreadIdInsertPt(CGF);
-}
-
 void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
                                               llvm::Constant *Addr,
                                               uint64_t Size, int32_t,
@@ -1806,11 +1504,8 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
     CGBuilderTy &Bld = CGF.Builder;
     llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
     llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
-    if (WFn) {
+    if (WFn)
       ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
-      // Remember for post-processing in worker loop.
-      Work.emplace_back(WFn);
-    }
     llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
 
     // Create a private scope that will globalize the arguments

diff  --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 3decf48cbb932..464af1294b46e 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -38,19 +38,7 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
   llvm::SmallVector<llvm::Function *, 16> Work;
 
   struct EntryFunctionState {
-    llvm::BasicBlock *ExitBB = nullptr;
-  };
-
-  class WorkerFunctionState {
-  public:
-    llvm::Function *WorkerFn;
-    const CGFunctionInfo &CGFI;
     SourceLocation Loc;
-
-    WorkerFunctionState(CodeGenModule &CGM, SourceLocation Loc);
-
-  private:
-    void createWorkerFunction(CodeGenModule &CGM);
   };
 
   ExecutionMode getExecutionMode() const;
@@ -60,20 +48,13 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
   /// Get barrier to synchronize all threads in a block.
   void syncCTAThreads(CodeGenFunction &CGF);
 
-  /// Emit the worker function for the current target region.
-  void emitWorkerFunction(WorkerFunctionState &WST);
+  /// Helper for target directive initialization.
+  void emitKernelInit(CodeGenFunction &CGF, EntryFunctionState &EST,
+                      bool IsSPMD);
 
-  /// Helper for worker function. Emit body of worker loop.
-  void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST);
-
-  /// Helper for non-SPMD target entry function. Guide the master and
-  /// worker threads to their respective locations.
-  void emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
-                              WorkerFunctionState &WST);
-
-  /// Signal termination of OMP execution for non-SPMD target entry
-  /// function.
-  void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+  /// Helper for target directive finalization.
+  void emitKernelDeinit(CodeGenFunction &CGF, EntryFunctionState &EST,
+                        bool IsSPMD);
 
   /// Helper for generic variables globalization prolog.
   void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc,
@@ -82,13 +63,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
   /// Helper for generic variables globalization epilog.
   void emitGenericVarsEpilog(CodeGenFunction &CGF, bool WithSPMDCheck = false);
 
-  /// Helper for SPMD mode target directive's entry function.
-  void emitSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
-                           const OMPExecutableDirective &D);
-
-  /// Signal termination of SPMD mode execution.
-  void emitSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
-
   //
   // Base class overrides.
   //

diff  --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 0a249b3e25749..a92c3ba381c67 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -779,6 +779,29 @@ class OpenMPIRBuilder {
                                       llvm::ConstantInt *Size,
                                       const llvm::Twine &Name = Twine(""));
 
+  /// The `omp target` interface
+  ///
+  /// For more information about the usage of this interface,
+  /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
+  ///
+  ///{
+
+  /// Create a runtime call for kmpc_target_init
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
+  /// \param RequiresFullRuntime Indicate if a full device runtime is necessary.
+  InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime);
+
+  /// Create a runtime call for kmpc_target_deinit
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
+  /// \param RequiresFullRuntime Indicate if a full device runtime is necessary.
+  void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime);
+
+  ///}
+
   /// Declarations for LLVM-IR types (simple, array, function and structure) are
   /// generated below. Their names are defined and used in OpenMPKinds.def. Here
   /// we provide the declarations, the initializeTypes function will provide the

diff  --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 1804cfeef7b8d..2003f44e34e9c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -409,10 +409,8 @@ __OMP_RTL(__kmpc_task_allow_completion_event, false, VoidPtr, IdentPtr,
           /* Int */ Int32, /* kmp_task_t */ VoidPtr)
 
 /// OpenMP Device runtime functions
-__OMP_RTL(__kmpc_kernel_init, false, Void, Int32, Int16)
-__OMP_RTL(__kmpc_kernel_deinit, false, Void, Int16)
-__OMP_RTL(__kmpc_spmd_kernel_init, false, Void, Int32, Int16)
-__OMP_RTL(__kmpc_spmd_kernel_deinit_v2, false, Void, Int16)
+__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int1, Int1, Int1)
+__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int1, Int1)
 __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
           VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)

diff  --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 1020de5f30ee9..60d71805c758f 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -20,6 +20,7 @@
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -2191,6 +2192,70 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
   return Builder.CreateCall(Fn, Args);
 }
 
+OpenMPIRBuilder::InsertPointTy
+OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime) {
+  if (!updateToLocation(Loc))
+    return Loc.IP;
+
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+  Value *Ident = getOrCreateIdent(SrcLocStr);
+  ConstantInt *IsSPMDVal = ConstantInt::getBool(Int32->getContext(), IsSPMD);
+  ConstantInt *UseGenericStateMachine =
+      ConstantInt::getBool(Int32->getContext(), !IsSPMD);
+  ConstantInt *RequiresFullRuntimeVal = ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);
+
+  Function *Fn = getOrCreateRuntimeFunctionPtr(
+      omp::RuntimeFunction::OMPRTL___kmpc_target_init);
+
+  CallInst *ThreadKind =
+      Builder.CreateCall(Fn, {Ident, IsSPMDVal, UseGenericStateMachine, RequiresFullRuntimeVal});
+
+  Value *ExecUserCode = Builder.CreateICmpEQ(
+      ThreadKind, ConstantInt::get(ThreadKind->getType(), -1), "exec_user_code");
+
+  // ThreadKind = __kmpc_target_init(...)
+  // if (ThreadKind == -1)
+  //   user_code
+  // else
+  //   return;
+
+  auto *UI = Builder.CreateUnreachable();
+  BasicBlock *CheckBB = UI->getParent();
+  BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
+
+  BasicBlock *WorkerExitBB = BasicBlock::Create(
+      CheckBB->getContext(), "worker.exit", CheckBB->getParent());
+  Builder.SetInsertPoint(WorkerExitBB);
+  Builder.CreateRetVoid();
+
+  auto *CheckBBTI = CheckBB->getTerminator();
+  Builder.SetInsertPoint(CheckBBTI);
+  Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
+
+  CheckBBTI->eraseFromParent();
+  UI->eraseFromParent();
+
+  // Continue in the "user_code" block, see diagram above and in
+  // openmp/libomptarget/deviceRTLs/common/include/target.h .
+  return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
+}
+
+void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
+                                         bool IsSPMD, bool RequiresFullRuntime) {
+  if (!updateToLocation(Loc))
+    return;
+
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+  Value *Ident = getOrCreateIdent(SrcLocStr);
+  ConstantInt *IsSPMDVal = ConstantInt::getBool(Int32->getContext(), IsSPMD);
+  ConstantInt *RequiresFullRuntimeVal = ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);
+
+  Function *Fn = getOrCreateRuntimeFunctionPtr(
+      omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
+
+  Builder.CreateCall(Fn, {Ident, IsSPMDVal, RequiresFullRuntimeVal});
+}
+
 std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
                                                    StringRef FirstSeparator,
                                                    StringRef Separator) {

diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 0127f9da43083..b1230b96dd6ae 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -26,9 +26,6 @@
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/IntrinsicsNVPTX.h"
-#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO.h"
@@ -37,7 +34,6 @@
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
 #include "llvm/Transforms/Utils/CodeExtractor.h"
 
-using namespace llvm::PatternMatch;
 using namespace llvm;
 using namespace omp;
 
@@ -2341,10 +2337,12 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
                               AllCallSitesKnown))
     SingleThreadedBBs.erase(&F->getEntryBlock());
 
-  // Check if the edge into the successor block compares a thread-id function to
-  // a constant zero.
-  // TODO: Use AAValueSimplify to simplify and propogate constants.
-  // TODO: Check more than a single use for thread ID's.
+  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+  auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
+
+  // Check if the edge into the successor block compares the __kmpc_target_init
+  // result with -1. If we are in non-SPMD-mode that signals only the main
+  // thread will execute the edge.
   auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
     if (!Edge || !Edge->isConditional())
       return false;
@@ -2355,31 +2353,20 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
     if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())
       return false;
 
-    // Temporarily match the pattern generated by clang for teams regions.
-    // TODO: Remove this once the new runtime is in place.
-    ConstantInt *One, *NegOne;
-    CmpInst::Predicate Pred;
-    auto &&m_ThreadID = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_tid_x>();
-    auto &&m_WarpSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_warpsize>();
-    auto &&m_BlockSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_ntid_x>();
-    if (match(Cmp, m_Cmp(Pred, m_ThreadID,
-                         m_And(m_Sub(m_BlockSize, m_ConstantInt(One)),
-                               m_Xor(m_Sub(m_WarpSize, m_ConstantInt(One)),
-                                     m_ConstantInt(NegOne))))))
-      if (One->isOne() && NegOne->isMinusOne() &&
-          Pred == CmpInst::Predicate::ICMP_EQ)
-        return true;
-
     ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
-    if (!C || !C->isZero())
+    if (!C)
       return false;
 
-    if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
-      if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
-        return true;
-    if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
-      if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
-        return true;
+    // Match:  -1 == __kmpc_target_init (for non-SPMD kernels only!)
+    if (C->isAllOnesValue()) {
+      auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0));
+      if (!CB || CB->getCalledFunction() != RFI.Declaration)
+        return false;
+      const int InitIsSPMDArgNo = 1;
+      auto *IsSPMDModeCI =
+          dyn_cast<ConstantInt>(CB->getOperand(InitIsSPMDArgNo));
+      return IsSPMDModeCI && IsSPMDModeCI->isZero();
+    }
 
     return false;
   };
@@ -2394,7 +2381,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
     for (auto PredBB = pred_begin(BB), PredEndBB = pred_end(BB);
          PredBB != PredEndBB; ++PredBB) {
       if (!IsInitialThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()),
-                              BB))
+                               BB))
         IsInitialThread &= SingleThreadedBBs.contains(*PredBB);
     }
 

diff  --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
index cb96fc3832a92..06224e6d4068e 100644
--- a/llvm/test/Transforms/OpenMP/replace_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -3,10 +3,16 @@
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64"
 
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
 @S = external local_unnamed_addr global i8*
+ at 0 = private unnamed_addr constant [113 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;1;;\00", align 1
+ at 1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @0, i32 0, i32 0) }, align 8
 
 ; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory
 ; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
+; CHECK-REMARKS-NOT: 6 bytes
+
 ; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef
 ; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef
 
@@ -25,14 +31,15 @@ entry:
 define void @bar() {
   call void @baz()
   call void @qux()
+  call void @negative_qux_spmd()
   ret void
 }
 
 ; CHECK: call void @use.internalized(i8* nofree writeonly addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* [[SHARED_X]], i32 0, i32 0) to i8*))
 define internal void @baz() {
 entry:
-  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %cmp = icmp eq i32 %tid, 0
+  %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 true)
+  %cmp = icmp eq i32 %call, -1
   br i1 %cmp, label %master, label %exit
 master:
   %x = call i8* @__kmpc_alloc_shared(i64 16), !dbg !11
@@ -48,20 +55,30 @@ exit:
 ; CHECK: call void @use.internalized(i8* nofree writeonly addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* [[SHARED_Y]], i32 0, i32 0) to i8*))
 define internal void @qux() {
 entry:
-  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %ntid = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %warpsize = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  %0 = sub nuw i32 %warpsize, 1
-  %1 = sub nuw i32 %ntid, 1
-  %2 = xor i32 %0, -1
-  %master_tid = and i32 %1, %2
-  %3 = icmp eq i32 %tid, %master_tid
-  br i1 %3, label %master, label %exit
+  %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true)
+  %0 = icmp eq i32 %call, -1
+  br i1 %0, label %master, label %exit
 master:
   %y = call i8* @__kmpc_alloc_shared(i64 4), !dbg !12
   %y_on_stack = bitcast i8* %y to [4 x i32]*
-  %4 = bitcast [4 x i32]* %y_on_stack to i8*
-  call void @use(i8* %4)
+  %1 = bitcast [4 x i32]* %y_on_stack to i8*
+  call void @use(i8* %1)
+  call void @__kmpc_free_shared(i8* %y)
+  br label %exit
+exit:
+  ret void
+}
+
+define internal void @negative_qux_spmd() {
+entry:
+  %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 true, i1 true, i1 true)
+  %0 = icmp eq i32 %call, -1
+  br i1 %0, label %master, label %exit
+master:
+  %y = call i8* @__kmpc_alloc_shared(i64 6), !dbg !12
+  %y_on_stack = bitcast i8* %y to [6 x i32]*
+  %1 = bitcast [6 x i32]* %y_on_stack to i8*
+  call void @use(i8* %1)
   call void @__kmpc_free_shared(i8* %y)
   br label %exit
 exit:
@@ -85,6 +102,7 @@ declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 
 declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5, !6}

diff  --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
index 5fff563d364d8..ae56477902b12 100644
--- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
+++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
@@ -3,8 +3,13 @@
 ; REQUIRES: asserts
 ; ModuleID = 'single_threaded_exeuction.c'
 
-define weak void @kernel() {
-  call void @__kmpc_kernel_init(i32 512, i16 1)
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+ at 0 = private unnamed_addr constant [1 x i8] c"\00", align 1
+ at 1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, align 8
+
+define void @kernel() {
+  call void @__kmpc_kernel_prepare_parallel(i8* null)
   call void @nvptx()
   call void @amdgcn()
   ret void
@@ -19,8 +24,8 @@ define weak void @kernel() {
 ; Function Attrs: noinline
 define internal void @nvptx() {
 entry:
-  %call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %cmp = icmp eq i32 %call, 0
+  %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 false)
+  %cmp = icmp eq i32 %call, -1
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:
@@ -40,8 +45,8 @@ if.end:
 ; Function Attrs: noinline
 define internal void @amdgcn() {
 entry:
-  %call = call i32 @llvm.amdgcn.workitem.id.x()
-  %cmp = icmp eq i32 %call, 0
+  %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true)
+  %cmp = icmp eq i32 %call, -1
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:
@@ -87,7 +92,9 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
-declare void @__kmpc_kernel_init(i32, i16)
+declare void @__kmpc_kernel_prepare_parallel(i8*)
+
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
 
 attributes #0 = { cold noinline }
 

diff  --git a/openmp/libomptarget/deviceRTLs/common/include/target.h b/openmp/libomptarget/deviceRTLs/common/include/target.h
new file mode 100644
index 0000000000000..997e93b924e24
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/include/target.h
@@ -0,0 +1,94 @@
+//===-- target.h ---------- OpenMP device runtime target implementation ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Target region interfaces are simple interfaces designed to allow middle-end
+// (=LLVM) passes to analyze and transform the code. To achieve good performance
+// it may be required to run the associated passes. However, implementations of
+// this interface shall always provide a correct implementation as close to the
+// user expected code as possible.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H
+#define LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H
+
+#include <stdint.h>
+
+extern "C" {
+
+/// Forward declaration of the source location identifier "ident".
+typedef struct ident ident_t;
+
+/// The target region _kernel_ interface for GPUs
+///
+/// This deliberatly simple interface provides the middle-end (=LLVM) with
+/// easier means to reason about the semantic of the code and transform it as
+/// well. The runtime calls are therefore also desiged to carry sufficient
+/// information necessary for optimizations.
+///
+///
+/// Intended usage:
+///
+/// \code
+/// void kernel(...) {
+///   ThreadKind = __kmpc_target_init(Ident, /* IsSPMD */ false,
+///                                   /* UseGenericStateMachine */ true,
+///                                   /* RequiresFullRuntime */ ... );
+///   if (ThreadKind == -1) {
+///     // User defined kernel code.
+///   }
+///   __kmpc_target_deinit(...);
+/// }
+/// \endcode
+///
+/// Which can be transformed to:
+///
+/// \code
+/// void kernel(...) {
+///   ThreadKind = __kmpc_target_init(Ident, /* IsSPMD */ false,
+///                                   /* UseGenericStateMachine */ false,
+///                                   /* RequiresFullRuntime */ ... );
+///   if (ThreadKind == -1) {
+///     // User defined kernel code.
+///   } else {
+///     assume(ThreadKind == ThreadId);
+///     // Custom, kernel-specific state machine code.
+///   }
+///   __kmpc_target_deinit(...);
+/// }
+/// \endcode
+///
+///
+///{
+
+/// Initialization
+///
+/// Must be called by all threads.
+///
+/// \param Ident               Source location identification, can be NULL.
+///
+int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD,
+                           bool UseGenericStateMachine,
+                           bool RequiresFullRuntime);
+
+/// De-Initialization
+///
+/// Must be called by the main thread in generic mode, can be called by all
+/// threads. Must be called by all threads in SPMD mode.
+///
+/// In non-SPMD, this function releases the workers trapped in a state machine
+/// and also any memory dynamically allocated by the runtime.
+///
+/// \param Ident Source location identification, can be NULL.
+///
+void __kmpc_target_deinit(ident_t *Ident, bool IsSPMD,
+                          bool RequiresFullRuntime);
+
+///}
+}
+#endif

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
index c117c7e00bf28..34af243fab541 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -12,6 +12,7 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
+#include "common/support.h"
 #include "target_impl.h"
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -26,16 +27,18 @@ extern omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext,
 // init entry points
 ////////////////////////////////////////////////////////////////////////////////
 
-EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
+static void __kmpc_generic_kernel_init() {
   PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
         OMPTARGET_NVPTX_VERSION);
-  ASSERT0(LT_FUSSY, RequiresOMPRuntime,
-          "Generic always requires initialized runtime.");
-  setExecutionParameters(Generic, RuntimeInitialized);
-  for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I)
-    parallelLevel[I] = 0;
+
+  if (GetLaneId() == 0)
+    parallelLevel[GetWarpId()] = 0;
 
   int threadIdInBlock = GetThreadIdInBlock();
+  if (threadIdInBlock != GetMasterThreadID())
+    return;
+
+  setExecutionParameters(Generic, RuntimeInitialized);
   ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
           "__kmpc_kernel_init() must be called by team master warp only!");
   PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n");
@@ -47,7 +50,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
       omptarget_nvptx_device_State[slot].Dequeue();
 
   // init thread private
-  int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false);
+  int threadId = 0;
   omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId);
 
   // init team context
@@ -62,20 +65,17 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
   // set number of threads and thread limit in team to started value
   omptarget_nvptx_TaskDescr *currTaskDescr =
       omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
-  nThreads = GetNumberOfThreadsInBlock();
-  threadLimit = ThreadLimit;
+  nThreads = GetNumberOfWorkersInTeam();
+  threadLimit = nThreads;
 
-  if (!__kmpc_is_spmd_exec_mode())
-    omptarget_nvptx_globalArgs.Init();
+  omptarget_nvptx_globalArgs.Init();
 
   __kmpc_data_sharing_init_stack();
   __kmpc_impl_target_init();
 }
 
-EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
+static void __kmpc_generic_kernel_deinit() {
   PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n");
-  ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized,
-          "Generic always requires initialized runtime.");
   // Enqueue omp state object for use by another team.
   int slot = usedSlotIdx;
   omptarget_nvptx_device_State[slot].Enqueue(
@@ -84,12 +84,11 @@ EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
   omptarget_nvptx_workFn = 0;
 }
 
-EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,
-                                    int16_t RequiresOMPRuntime) {
+static void __kmpc_spmd_kernel_init(bool RequiresFullRuntime) {
   PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
 
-  setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized
-                                                  : RuntimeUninitialized);
+  setExecutionParameters(Spmd, RequiresFullRuntime ? RuntimeInitialized
+                         : RuntimeUninitialized);
   int threadId = GetThreadIdInBlock();
   if (threadId == 0) {
     usedSlotIdx = __kmpc_impl_smid() % MAX_SM;
@@ -100,11 +99,8 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,
         1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
   }
   __kmpc_data_sharing_init_stack();
-  if (!RequiresOMPRuntime) {
-    // Runtime is not required - exit.
-    __kmpc_impl_syncthreads();
+  if (!RequiresFullRuntime)
     return;
-  }
 
   //
   // Team Context Initialization.
@@ -138,16 +134,17 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,
                                                              newTaskDescr);
 
   // init thread private from init value
+  int ThreadLimit = GetNumberOfProcsInTeam(/* IsSPMD */ true);
   PRINT(LD_PAR,
         "thread will execute parallel region with id %d in a team of "
         "%d threads\n",
         (int)newTaskDescr->ThreadId(), (int)ThreadLimit);
 }
 
-EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
+static void __kmpc_spmd_kernel_deinit(bool RequiresFullRuntime) {
   // We're not going to pop the task descr stack of each thread since
   // there are no more parallel regions in SPMD mode.
-  if (!RequiresOMPRuntime)
+  if (!RequiresFullRuntime)
     return;
 
   __kmpc_impl_syncthreads();
@@ -165,4 +162,68 @@ EXTERN int8_t __kmpc_is_spmd_exec_mode() {
   return (execution_param & ModeMask) == Spmd;
 }
 
+EXTERN bool __kmpc_kernel_parallel(void**WorkFn);
+
+static void __kmpc_target_region_state_machine(ident_t *Ident) {
+
+  int TId = GetThreadIdInBlock();
+  do {
+    void* WorkFn = 0;
+
+    // Wait for the signal that we have a new work function.
+    __kmpc_barrier_simple_spmd(Ident, TId);
+
+
+    // Retrieve the work function from the runtime.
+    bool IsActive = __kmpc_kernel_parallel(&WorkFn);
+
+    // If there is nothing more to do, break out of the state machine by
+    // returning to the caller.
+    if (!WorkFn)
+      return;
+
+    if (IsActive) {
+      ((void(*)(uint32_t,uint32_t))WorkFn)(0, TId);
+      __kmpc_kernel_end_parallel();
+    }
+
+    __kmpc_barrier_simple_spmd(Ident, TId);
+
+  } while (true);
+}
+
+EXTERN
+int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD,
+                           bool UseGenericStateMachine,
+                           bool RequiresFullRuntime) {
+  int TId = GetThreadIdInBlock();
+  if (IsSPMD)
+    __kmpc_spmd_kernel_init(RequiresFullRuntime);
+  else
+    __kmpc_generic_kernel_init();
+
+   if (IsSPMD) {
+    __kmpc_barrier_simple_spmd(Ident, TId);
+     return -1;
+   }
+
+   if (TId == GetMasterThreadID())
+     return -1;
+
+  if (UseGenericStateMachine)
+    __kmpc_target_region_state_machine(Ident);
+
+  return TId;
+}
+
+EXTERN
+void __kmpc_target_deinit(ident_t *Ident, bool IsSPMD,
+                           bool RequiresFullRuntime) {
+  if (IsSPMD)
+    __kmpc_spmd_kernel_deinit(RequiresFullRuntime);
+  else
+    __kmpc_generic_kernel_deinit();
+}
+
+
 #pragma omp end declare target

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
index 29a6db85d172c..ea885b806aca4 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -331,7 +331,7 @@ EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid,
         (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
 
   // Master signals work to activate workers.
-  __kmpc_barrier_simple_spmd(nullptr, 0);
+  __kmpc_barrier_simple_spmd(ident, 0);
 
   // OpenMP [2.5, Parallel Construct, p.49]
   // There is an implied barrier at the end of a parallel region. After the
@@ -339,7 +339,7 @@ EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid,
   // execution of the enclosing task region.
   //
   // The master waits at this barrier until all workers are done.
-  __kmpc_barrier_simple_spmd(nullptr, 0);
+  __kmpc_barrier_simple_spmd(ident, 0);
 
   // Decrement parallel level for non-SPMD warps.
   for (int I = 0; I < NumWarps; ++I)

diff  --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
index 082b6b9d11090..e0c433060c85b 100644
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -416,11 +416,11 @@ EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
                              int32_t cancelVal);
 
 // non standard
-EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
-EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
-EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,
-                                    int16_t RequiresOMPRuntime);
-EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
+EXTERN int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD,
+                                 bool UseGenericStateMachine,
+                           bool RequiresFullRuntime);
+EXTERN void __kmpc_target_deinit(ident_t *Ident, bool IsSPMD,
+                           bool RequiresFullRuntime);
 EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn);
 EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
 EXTERN void __kmpc_kernel_end_parallel();

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index eafa73426a950..35324f070e4d6 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -60,7 +60,13 @@ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
   return Mask;
 }
 
-EXTERN void __kmpc_impl_syncthreads() { __syncthreads(); }
+EXTERN void __kmpc_impl_syncthreads() { 
+  int barrier = 2;
+  asm volatile("barrier.sync %0;"
+               :
+               : "r"(barrier)
+               : "memory");
+}
 
 EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
   __nvvm_bar_warp_sync(Mask);
@@ -75,7 +81,7 @@ EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
   // The named barrier for active parallel threads of a team in an L1 parallel
   // region to synchronize with each other.
   int barrier = 1;
-  asm volatile("bar.sync %0, %1;"
+  asm volatile("barrier.sync %0, %1;"
                :
                : "r"(barrier), "r"(num_threads)
                : "memory");