[clang] [llvm] [mlir] [OMPIRBuilder] - Handle dependencies in `createTarget` (PR #93977)

Pranav Bhandarkar via cfe-commits cfe-commits at lists.llvm.org
Mon Jul 8 23:09:34 PDT 2024


https://github.com/bhandarkar-pranav updated https://github.com/llvm/llvm-project/pull/93977

>From 8060e0bb038166ead68eb6068e6559325a605c0c Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 6 May 2024 23:05:37 -0500
Subject: [PATCH 01/24] Add a flag to choose new codegen

---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index f6d12d46cfc07..7b6e93e2122aa 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -48,6 +48,10 @@
 using namespace clang;
 using namespace CodeGen;
 using namespace llvm::omp;
+// Experiment to make sanitizers easier to debug
+static llvm::cl::opt<bool> NewClangTargetTaskCodeGen(
+    "new-clang-target-task-codegen", llvm::cl::Optional,
+    llvm::cl::desc("new clang target task codegen."), llvm::cl::init(false));
 
 namespace {
 /// Base class for handling code generation inside OpenMP regions.
@@ -9620,9 +9624,13 @@ static void emitTargetCallKernelLaunch(
         DeviceID, RTLoc, AllocaIP));
   };
 
-  if (RequiresOuterTask)
-    CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
-  else
+  if (RequiresOuterTask) {
+    if (NewClangTargetTaskCodeGen) {
+      llvm::errs() << "Using OMPIRBuilder for target task codegen\n";
+    } else {
+      CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
+    }
+  } else
     OMPRuntime->emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen);
 }
 

>From 83f09a5fb05f440d7f9de36d6bb9e693227d66ac Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 13 May 2024 11:21:33 -0500
Subject: [PATCH 02/24] clang prints for debugging

---
 clang/lib/CodeGen/CGStmtOpenMP.cpp            |  6 +++-
 clang/lib/CodeGen/CodeGenFunction.h           |  3 ++
 clang/lib/Parse/ParseOpenMP.cpp               | 12 +++++++-
 clang/lib/Sema/SemaOpenMP.cpp                 | 28 ++++++++++++++++++-
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  7 ++++-
 5 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 6410f9e102c90..200dd1878a449 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5043,7 +5043,11 @@ createImplicitFirstprivateForType(ASTContext &C, OMPTaskDataTy &Data,
   Data.FirstprivateInits.emplace_back(InitRef);
   return OrigVD;
 }
-
+void CodeGenFunction::NewEmitOMPTargetTaskBasedDirective(
+    const OMPExecutableDirective &S, const RegionCodeGenTy &BodyGen,
+    OMPTargetDataInfo &InputInfo) {
+  EmitOMPTargetTaskBasedDirective(S, BodyGen, InputInfo);
+}
 void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
     const OMPExecutableDirective &S, const RegionCodeGenTy &BodyGen,
     OMPTargetDataInfo &InputInfo) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 45585361a4fc9..f30666226c4df 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3797,6 +3797,9 @@ class CodeGenFunction : public CodeGenTypeCache {
   void EmitOMPTargetTaskBasedDirective(const OMPExecutableDirective &S,
                                        const RegionCodeGenTy &BodyGen,
                                        OMPTargetDataInfo &InputInfo);
+  void NewEmitOMPTargetTaskBasedDirective(const OMPExecutableDirective &S,
+                                       const RegionCodeGenTy &BodyGen,
+                                       OMPTargetDataInfo &InputInfo);
   void processInReduction(const OMPExecutableDirective &S,
                           OMPTaskDataTy &Data,
                           CodeGenFunction &CGF,
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index e959dd6378f46..ec07a7d3854af 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2972,11 +2972,19 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
       // FIXME: We create a bogus CompoundStmt scope to hold the contents of
       // the captured region. Code elsewhere assumes that any FunctionScopeInfo
       // should have at least one compound statement scope within it.
+      if (AssociatedStmt.get()) {
+        llvm::errs() << __FUNCTION__ << "Loc-1:\n";
+        AssociatedStmt.get()->dump();
+      }
       ParsingOpenMPDirectiveRAII NormalScope(*this, /*Value=*/false);
       {
         Sema::CompoundScopeRAII Scope(Actions);
         AssociatedStmt = ParseStatement();
-
+        Stmt * pdb_print = AssociatedStmt.get();
+        if (pdb_print) {
+          llvm::errs() << __FUNCTION__ << "Loc0:\n";
+          pdb_print->dump();
+        }
         if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
             getLangOpts().OpenMPIRBuilder)
           AssociatedStmt =
@@ -2984,6 +2992,8 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
       }
       AssociatedStmt =
           Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses);
+      llvm::errs() << __FUNCTION__ << "Loc1:\n";
+      AssociatedStmt.get()->dump();
     } else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data ||
                DKind == OMPD_target_exit_data) {
       Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope());
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index bab61e8fd54e8..db882f52b225e 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4854,6 +4854,19 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
 
   SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
   getOpenMPCaptureRegions(CaptureRegions, DSAStack->getCurrentDirective());
+  llvm::errs() << __FUNCTION__ << ": Loc0:\n";
+  for (OpenMPDirectiveKind c : CaptureRegions) {
+    switch(c) {
+    case OMPD_task:
+      llvm::errs() << "OMPD_task\n";
+      break;
+    case OMPD_target:
+      llvm::errs() << "OMPD_target\n";
+      break;
+    default:
+      llvm::errs() << "default\n";
+    }
+  }
   OMPOrderedClause *OC = nullptr;
   OMPScheduleClause *SC = nullptr;
   SmallVector<const OMPLinearClause *, 4> LCs;
@@ -5005,7 +5018,11 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
     }
     if (++CompletedRegions == CaptureRegions.size())
       DSAStack->setBodyComplete();
+    llvm::errs() << __FUNCTION__ << ": Loc1:\n";
+    SR.get()->dump();
     SR = SemaRef.ActOnCapturedRegionEnd(SR.get());
+    llvm::errs() << __FUNCTION__ << ": Loc2:\n";
+    SR.get()->dump();
   }
   return SR;
 }
@@ -6337,7 +6354,16 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
   OpenMPBindClauseKind BindKind = OMPC_BIND_unknown;
   llvm::SmallVector<OMPClause *> ClausesWithoutBind;
   bool UseClausesWithoutBind = false;
-
+  if (Kind == Directive::OMPD_target) {
+    if (AStmt) {
+      llvm::errs() << __FUNCTION__ << "***********************\n";
+      AStmt->dump();
+      llvm::errs() << __FUNCTION__ <<  "***PRETTY***\n";
+      AStmt->dumpPretty(getASTContext());
+    } else {
+      llvm::errs() << "__FUNCTION__" << ": AStmt is nullptr\n";
+    }
+  }
   if (const OMPBindClause *BC =
           OMPExecutableDirective::getSingleClause<OMPBindClause>(Clauses))
     BindKind = BC->getBindKind();
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index e54ec4f2b1d72..7e414f7406bf4 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1518,7 +1518,12 @@ class OpenMPIRBuilder {
   std::forward_list<CanonicalLoopInfo> LoopInfos;
 
   /// Add a new region that will be outlined later.
-  void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
+  void addOutlineInfo(OutlineInfo &&OI) {
+    llvm::errs() << "Adding outline info\n";
+    llvm::errs() << "OI.EntryBB = ";
+    OI.EntryBB->dump();
+    OutlineInfos.emplace_back(OI);
+  }
 
   /// An ordered map of auto-generated variables to their unique names.
   /// It stores variables with the following names: 1) ".gomp_critical_user_" +

>From e2aa768c4de30fd0ed52e96c70e2395a9710a929 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 13 May 2024 16:25:51 -0500
Subject: [PATCH 03/24] add an option -new-ompirbuilder-target-codegen to
 enable dependency-based target codegen path in OMPIRBuilder

---
 clang/lib/CodeGen/CodeGenFunction.h           |  4 +-
 clang/lib/Parse/ParseOpenMP.cpp               |  2 +-
 clang/lib/Sema/SemaOpenMP.cpp                 |  7 ++-
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  9 +++
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 57 ++++++++++++++---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 61 +++++++++++--------
 6 files changed, 102 insertions(+), 38 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index f30666226c4df..b80be8ed85458 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3798,8 +3798,8 @@ class CodeGenFunction : public CodeGenTypeCache {
                                        const RegionCodeGenTy &BodyGen,
                                        OMPTargetDataInfo &InputInfo);
   void NewEmitOMPTargetTaskBasedDirective(const OMPExecutableDirective &S,
-                                       const RegionCodeGenTy &BodyGen,
-                                       OMPTargetDataInfo &InputInfo);
+                                          const RegionCodeGenTy &BodyGen,
+                                          OMPTargetDataInfo &InputInfo);
   void processInReduction(const OMPExecutableDirective &S,
                           OMPTaskDataTy &Data,
                           CodeGenFunction &CGF,
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index ec07a7d3854af..ff6d68f616207 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2980,7 +2980,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
       {
         Sema::CompoundScopeRAII Scope(Actions);
         AssociatedStmt = ParseStatement();
-        Stmt * pdb_print = AssociatedStmt.get();
+        Stmt *pdb_print = AssociatedStmt.get();
         if (pdb_print) {
           llvm::errs() << __FUNCTION__ << "Loc0:\n";
           pdb_print->dump();
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index db882f52b225e..f0a3ec40dee78 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4856,7 +4856,7 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
   getOpenMPCaptureRegions(CaptureRegions, DSAStack->getCurrentDirective());
   llvm::errs() << __FUNCTION__ << ": Loc0:\n";
   for (OpenMPDirectiveKind c : CaptureRegions) {
-    switch(c) {
+    switch (c) {
     case OMPD_task:
       llvm::errs() << "OMPD_task\n";
       break;
@@ -6358,10 +6358,11 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
     if (AStmt) {
       llvm::errs() << __FUNCTION__ << "***********************\n";
       AStmt->dump();
-      llvm::errs() << __FUNCTION__ <<  "***PRETTY***\n";
+      llvm::errs() << __FUNCTION__ << "***PRETTY***\n";
       AStmt->dumpPretty(getASTContext());
     } else {
-      llvm::errs() << "__FUNCTION__" << ": AStmt is nullptr\n";
+      llvm::errs() << "__FUNCTION__"
+                   << ": AStmt is nullptr\n";
     }
   }
   if (const OMPBindClause *BC =
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 7e414f7406bf4..dd7605cbae6a6 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2250,6 +2250,15 @@ class OpenMPIRBuilder {
   /// \param BodyGenCB Callback that will generate the region code.
   /// \param ArgAccessorFuncCB Callback that will generate accessors
   /// instructions for passed in target arguments where neccessary
+
+  InsertPointTy newCreateTarget(
+      const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy AllocaIP,
+      OpenMPIRBuilder::InsertPointTy CodeGenIP,
+      TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, int32_t NumThreads,
+      SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
+      TargetBodyGenCallbackTy BodyGenCB,
+      TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
+      SmallVector<DependData> Dependencies = {});
   InsertPointTy createTarget(const LocationDescription &Loc,
                              OpenMPIRBuilder::InsertPointTy AllocaIP,
                              OpenMPIRBuilder::InsertPointTy CodeGenIP,
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index cb4de9c8876dc..02106c7316dca 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -73,6 +73,11 @@ static cl::opt<double> UnrollThresholdFactor(
              "simplifications still taking place"),
     cl::init(1.5));
 
+static cl::opt<bool>
+    NewOMPIRBuilderTargetCodegen("new-ompirbuilder-target-codegen", cl::Hidden,
+                                 cl::desc("Use target-task based codegen."),
+                                 cl::init(false));
+
 #ifndef NDEBUG
 /// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
 /// at position IP1 may change the meaning of IP2 or vice-versa. This is because
@@ -5230,12 +5235,13 @@ static void emitTargetOutlinedFunction(
                                       OutlinedFn, OutlinedFnID);
 }
 
-static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
-                           OpenMPIRBuilder::InsertPointTy AllocaIP,
-                           Function *OutlinedFn, Constant *OutlinedFnID,
-                           int32_t NumTeams, int32_t NumThreads,
-                           SmallVectorImpl<Value *> &Args,
-                           OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) {
+static void emitTargetCall(
+    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
+    OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
+    Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads,
+    SmallVectorImpl<Value *> &Args,
+    OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
+    SmallVector<llvm::OpenMPIRBuilder::DependData> dependencies = {}) {
 
   OpenMPIRBuilder::TargetDataInfo Info(
       /*RequiresDevicePointerInfo=*/false,
@@ -5276,12 +5282,49 @@ static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
   OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
                                           NumTeamsVal, NumThreadsVal,
                                           DynCGGroupMem, HasNoWait);
-
+  // PDB: here you'll have to break the logic down to do the following
+  // if (!requiresoutertask) {
+  //    Builder.restoreIP(OMPBuilder.emitKernelLaunch(
+  //       Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
+  //       DeviceID, RTLoc, AllocaIP));
+  // else {
+  //     create task
+  //     make task call emitkernellaunch.
+  //     make task call
+  // }
+  //
   Builder.restoreIP(OMPBuilder.emitKernelLaunch(
       Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
       DeviceID, RTLoc, AllocaIP));
 }
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::newCreateTarget(
+    const LocationDescription &Loc, InsertPointTy AllocaIP,
+    InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
+    int32_t NumThreads, SmallVectorImpl<Value *> &Args,
+    GenMapInfoCallbackTy GenMapInfoCB,
+    OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
+    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
+    SmallVector<DependData> Dependencies) {
+  if (!NewOMPIRBuilderTargetCodegen) {
+    llvm::errs() << "Old OpenMPIRBuilder target codegen\n";
+    return createTarget(Loc, AllocaIP, CodeGenIP, EntryInfo, NumTeams,
+                        NumThreads, Args, GenMapInfoCB, CBFunc,
+                        ArgAccessorFuncCB);
+  }
+  llvm::errs() << "New OpenMPIRBuilder target codegen\n";
+  if (!updateToLocation(Loc))
+    return InsertPointTy();
 
+  Builder.restoreIP(CodeGenIP);
+  Function *OutlinedFn;
+  Constant *OutlinedFnID;
+  emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
+                             OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
+  if (!Config.isTargetDevice())
+    emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
+                   NumThreads, Args, GenMapInfoCB, Dependencies);
+  return Builder.saveIP();
+}
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
     InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6ec4c120c11ea..2fd3aef44ebd5 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -681,7 +681,30 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
       ompLoc, bodyCB, numTeamsLower, numTeamsUpper, threadLimit, ifExpr));
   return bodyGenStatus;
 }
-
+static void
+buildDependData(std::optional<ArrayAttr> depends, OperandRange dependVars,
+                LLVM::ModuleTranslation &moduleTranslation,
+                SmallVector<llvm::OpenMPIRBuilder::DependData> &dds) {
+  for (auto dep : llvm::zip(dependVars, depends->getValue())) {
+    llvm::omp::RTLDependenceKindTy type;
+    switch (
+        cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
+    case mlir::omp::ClauseTaskDepend::taskdependin:
+      type = llvm::omp::RTLDependenceKindTy::DepIn;
+      break;
+    // The OpenMP runtime requires that the codegen for 'depend' clause for
+    // 'out' dependency kind must be the same as codegen for 'depend' clause
+    // with 'inout' dependency.
+    case mlir::omp::ClauseTaskDepend::taskdependout:
+    case mlir::omp::ClauseTaskDepend::taskdependinout:
+      type = llvm::omp::RTLDependenceKindTy::DepInOut;
+      break;
+    };
+    llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
+    llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
+    dds.emplace_back(dd);
+  }
+}
 /// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
@@ -705,28 +728,10 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
   };
 
   SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
-  if (!taskOp.getDependVars().empty() && taskOp.getDepends()) {
-    for (auto dep :
-         llvm::zip(taskOp.getDependVars(), taskOp.getDepends()->getValue())) {
-      llvm::omp::RTLDependenceKindTy type;
-      switch (
-          cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
-      case mlir::omp::ClauseTaskDepend::taskdependin:
-        type = llvm::omp::RTLDependenceKindTy::DepIn;
-        break;
-      // The OpenMP runtime requires that the codegen for 'depend' clause for
-      // 'out' dependency kind must be the same as codegen for 'depend' clause
-      // with 'inout' dependency.
-      case mlir::omp::ClauseTaskDepend::taskdependout:
-      case mlir::omp::ClauseTaskDepend::taskdependinout:
-        type = llvm::omp::RTLDependenceKindTy::DepInOut;
-        break;
-      };
-      llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
-      llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
-      dds.emplace_back(dd);
-    }
-  }
+  if (!taskOp.getDependVars().empty() && taskOp.getDepends())
+    buildDependData(taskOp.getDepends(), taskOp.getDependVars(),
+                    moduleTranslation, dds);
+  llvm::errs() << "# Dependencies in task op = " << dds.size() << "\n";
 
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
       findAllocaInsertPoint(builder, moduleTranslation);
@@ -3088,10 +3093,16 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
     if (!mapData.IsDeclareTarget[i] && !mapData.IsAMember[i])
       kernelInput.push_back(mapData.OriginalValue[i]);
   }
+  SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
+  if (!targetOp.getDependVars().empty() && targetOp.getDepends())
+    buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
+                    moduleTranslation, dds);
+  llvm::errs() << "# Dependencies in target op = " << dds.size() << "\n";
 
-  builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
+  builder.restoreIP(moduleTranslation.getOpenMPBuilder()->newCreateTarget(
       ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
-      defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB));
+      defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB,
+      dds));
 
   // Remap access operations to declare target reference pointers for the
   // device, essentially generating extra loadop's as necessary

>From e15bb967c1dde4ea0efc75d8da33be173589551b Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 16 May 2024 17:12:13 -0500
Subject: [PATCH 04/24] checkpoint commit -> able to create an inlined version
 of the task that offloads

---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 25 ++++-
 clang/lib/CodeGen/CGStmt.cpp                  | 27 ++++++
 clang/lib/CodeGen/CGStmtOpenMP.cpp            |  2 +
 clang/lib/Parse/ParseOpenMP.cpp               | 14 +--
 clang/lib/Sema/SemaOpenMP.cpp                 | 56 +++++------
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  6 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 93 +++++++++++++++++--
 7 files changed, 178 insertions(+), 45 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 7b6e93e2122aa..f56c878b45df8 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1320,10 +1320,18 @@ llvm::Function *CGOpenMPRuntime::emitTaskOutlinedFunction(
     HasCancel = TD->hasCancel();
 
   CodeGenFunction CGF(CGM, true);
+  // llvm::errs() << "LLVMDEBUG::Before CGInfo\n";
+  // CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
   CGOpenMPTaskOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen,
                                         InnermostKind, HasCancel, Action);
   CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
+  // llvm::errs() << "LLVMDEBUG::Before GenerateCapturedStmt\n";
+  // CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
   llvm::Function *Res = CGF.GenerateCapturedStmtFunction(*CS);
+  llvm::errs() << "LLVMDEBUG::After GenerateCapturedStmt\n";
+  llvm::errs() << "LLVMDEBUG::CapturedStmt is \n";
+  CS->dump();
+  CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
   if (!Tied)
     NumberOfParts = Action.getNumberOfParts();
   return Res;
@@ -3707,7 +3715,16 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
       KmpTaskTWithPrivatesQTy, KmpTaskTQTy, SharedsPtrTy, TaskFunction,
       TaskPrivatesMap);
 
-  // Build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
+  llvm::errs() << "LLVMDEBUG::Proxy task function is \n";
+  TaskEntry->dump();
+  llvm::errs() << "LLVMDEBUG::CGF.Builder.GetInsertBlock() after emitting "
+                  "proxy task function is \n";
+  CGF.Builder.GetInsertBlock()->dump();
+  llvm::errs() << "LLVMDEBUG::SharedsTy is \n";
+  CharUnits cu = C.getTypeSizeInChars(SharedsTy);
+  llvm::errs() << "LLVMDEBUG::sizeof(SharedsTy) = \n";
+  llvm::errs() << cu.getQuantity() << "\n";
+  // build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
   // kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
   // kmp_routine_entry_t *task_entry);
   // Task flags. Format is taken from
@@ -9549,9 +9566,15 @@ static void emitTargetCallKernelLaunch(
   emitOffloadingArrays(CGF, CombinedInfo, Info, OMPBuilder);
   bool EmitDebug = CGF.CGM.getCodeGenOpts().getDebugInfo() !=
                    llvm::codegenoptions::NoDebugInfo;
+  llvm::errs() << "LLVMDEBUG::After emitOffloadingArrays in "
+                  "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
+  OMPBuilder.Builder.GetInsertBlock()->dump();
   OMPBuilder.emitOffloadingArraysArgument(CGF.Builder, Info.RTArgs, Info,
                                           EmitDebug,
                                           /*ForEndCall=*/false);
+  llvm::errs() << "LLVMDEBUG::After emitOffloadingArraysArgument in "
+                  "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
+  OMPBuilder.Builder.GetInsertBlock()->dump();
 
   InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
   InputInfo.BasePointersArray = Address(Info.RTArgs.BasePointersArray,
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 99daaa14cf3fe..26baad23b87c5 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -3135,6 +3135,12 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
   const RecordDecl *RD = S.getCapturedRecordDecl();
   SourceLocation Loc = S.getBeginLoc();
   assert(CD->hasBody() && "missing CapturedDecl body");
+  llvm::errs() << "LLVMDEBUG:: In GenerateCapturedStmtFunction\n";
+  if (Builder.GetInsertBlock()) {
+    llvm::errs()
+        << "LLVMDEBUG:: In GenerateCapturedStmtFunction, InsertBlock is \n";
+    Builder.GetInsertBlock()->dump();
+  }
 
   // Build the argument list.
   ASTContext &Ctx = CGM.getContext();
@@ -3156,6 +3162,13 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
   // Generate the function.
   StartFunction(CD, Ctx.VoidTy, F, FuncInfo, Args, CD->getLocation(),
                 CD->getBody()->getBeginLoc());
+  llvm::errs()
+      << "LLVMDEBUG:: In GenerateCapturedStmtFunction: After StartFunction\n";
+  if (Builder.GetInsertBlock()) {
+    llvm::errs()
+        << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
+    Builder.GetInsertBlock()->getParent()->dump();
+  }
   // Set the context parameter in CapturedStmtInfo.
   Address DeclPtr = GetAddrOfLocalVar(CD->getContextParam());
   CapturedStmtInfo->setContextValue(Builder.CreateLoad(DeclPtr));
@@ -3181,7 +3194,21 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
   }
 
   PGO.assignRegionCounters(GlobalDecl(CD), F);
+  llvm::errs()
+      << "LLVMDEBUG:: In GenerateCapturedStmtFunction: Before EmitBody\n";
+  if (Builder.GetInsertBlock()) {
+    llvm::errs()
+        << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
+    Builder.GetInsertBlock()->getParent()->dump();
+  }
   CapturedStmtInfo->EmitBody(*this, CD->getBody());
+  llvm::errs()
+      << "LLVMDEBUG:: In GenerateCapturedStmtFunction: After EmitBody\n";
+  if (Builder.GetInsertBlock()) {
+    llvm::errs()
+        << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
+    Builder.GetInsertBlock()->getParent()->dump();
+  }
   FinishFunction(CD->getBodyRBrace());
 
   return F;
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 200dd1878a449..1cd3e72c38cc0 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5187,6 +5187,8 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
   llvm::Function *OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction(
       S, *I, *PartId, *TaskT, S.getDirectiveKind(), CodeGen, /*Tied=*/true,
       Data.NumberOfParts);
+  llvm::errs() << "LLVMDEBUG::Outlined Task Fn is \n";
+  OutlinedFn->dump();
   llvm::APInt TrueOrFalse(32, S.hasClausesOfKind<OMPNowaitClause>() ? 1 : 0);
   IntegerLiteral IfCond(getContext(), TrueOrFalse,
                         getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index ff6d68f616207..547dd8fcf4552 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2980,11 +2980,11 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
       {
         Sema::CompoundScopeRAII Scope(Actions);
         AssociatedStmt = ParseStatement();
-        Stmt *pdb_print = AssociatedStmt.get();
-        if (pdb_print) {
-          llvm::errs() << __FUNCTION__ << "Loc0:\n";
-          pdb_print->dump();
-        }
+        // Stmt *pdb_print = AssociatedStmt.get();
+        // if (pdb_print) {
+        //   llvm::errs() << __FUNCTION__ << "Loc0:\n";
+        //   pdb_print->dump();
+        // }
         if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
             getLangOpts().OpenMPIRBuilder)
           AssociatedStmt =
@@ -2992,8 +2992,8 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
       }
       AssociatedStmt =
           Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses);
-      llvm::errs() << __FUNCTION__ << "Loc1:\n";
-      AssociatedStmt.get()->dump();
+      // llvm::errs() << __FUNCTION__ << "Loc1:\n";
+      // AssociatedStmt.get()->dump();
     } else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data ||
                DKind == OMPD_target_exit_data) {
       Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope());
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index f0a3ec40dee78..211b93a171dfe 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4854,19 +4854,19 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
 
   SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
   getOpenMPCaptureRegions(CaptureRegions, DSAStack->getCurrentDirective());
-  llvm::errs() << __FUNCTION__ << ": Loc0:\n";
-  for (OpenMPDirectiveKind c : CaptureRegions) {
-    switch (c) {
-    case OMPD_task:
-      llvm::errs() << "OMPD_task\n";
-      break;
-    case OMPD_target:
-      llvm::errs() << "OMPD_target\n";
-      break;
-    default:
-      llvm::errs() << "default\n";
-    }
-  }
+  // llvm::errs() << __FUNCTION__ << ": Loc0:\n";
+  // for (OpenMPDirectiveKind c : CaptureRegions) {
+  //   switch (c) {
+  //   case OMPD_task:
+  //     llvm::errs() << "OMPD_task\n";
+  //     break;
+  //   case OMPD_target:
+  //     llvm::errs() << "OMPD_target\n";
+  //     break;
+  //   default:
+  //     llvm::errs() << "default\n";
+  //   }
+  // }
   OMPOrderedClause *OC = nullptr;
   OMPScheduleClause *SC = nullptr;
   SmallVector<const OMPLinearClause *, 4> LCs;
@@ -5018,11 +5018,11 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
     }
     if (++CompletedRegions == CaptureRegions.size())
       DSAStack->setBodyComplete();
-    llvm::errs() << __FUNCTION__ << ": Loc1:\n";
-    SR.get()->dump();
+    // llvm::errs() << __FUNCTION__ << ": Loc1:\n";
+    // SR.get()->dump();
     SR = SemaRef.ActOnCapturedRegionEnd(SR.get());
-    llvm::errs() << __FUNCTION__ << ": Loc2:\n";
-    SR.get()->dump();
+    // llvm::errs() << __FUNCTION__ << ": Loc2:\n";
+    // SR.get()->dump();
   }
   return SR;
 }
@@ -6354,17 +6354,17 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
   OpenMPBindClauseKind BindKind = OMPC_BIND_unknown;
   llvm::SmallVector<OMPClause *> ClausesWithoutBind;
   bool UseClausesWithoutBind = false;
-  if (Kind == Directive::OMPD_target) {
-    if (AStmt) {
-      llvm::errs() << __FUNCTION__ << "***********************\n";
-      AStmt->dump();
-      llvm::errs() << __FUNCTION__ << "***PRETTY***\n";
-      AStmt->dumpPretty(getASTContext());
-    } else {
-      llvm::errs() << "__FUNCTION__"
-                   << ": AStmt is nullptr\n";
-    }
-  }
+  // if (Kind == Directive::OMPD_target) {
+  //   if (AStmt) {
+  //     llvm::errs() << __FUNCTION__ << "***********************\n";
+  //     AStmt->dump();
+  //     llvm::errs() << __FUNCTION__ << "***PRETTY***\n";
+  //     AStmt->dumpPretty(getASTContext());
+  //   } else {
+  //     llvm::errs() << "__FUNCTION__"
+  //                  << ": AStmt is nullptr\n";
+  //   }
+  // }
   if (const OMPBindClause *BC =
           OMPExecutableDirective::getSingleClause<OMPBindClause>(Clauses))
     BindKind = BC->getBindKind();
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index dd7605cbae6a6..10a85e72ae7dc 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1766,7 +1766,11 @@ class OpenMPIRBuilder {
       const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
       EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
       Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
-
+  InsertPointTy emitTargetTask(IRBuilderBase &Builder, Function *OutlinedFn,
+                               Value *OutlinedFnID,
+                               EmitFallbackCallbackTy EmitTargetCallFallbackCB,
+                               TargetKernelArgs &Args, Value *DeviceID,
+                               Value *RTLoc);
   /// Emit the arguments to be passed to the runtime library based on the
   /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
   /// ForEndCall, emit map types to be passed for the end of the region instead
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 02106c7316dca..e02794c255ca0 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1049,6 +1049,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
       Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
   Builder.restoreIP(Loc.IP);
 
+  llvm::errs() << "LLVMDEBUG::KernelArgs.size() in emitTargetKernel = "
+               << KernelArgs.size() << "\n";
+
   for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
     llvm::Value *Arg =
         Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
@@ -1757,7 +1760,9 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
     assert(OutlinedFn.getNumUses() == 1 &&
            "there must be a single user for the outlined function");
     CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
-
+    llvm::errs() << "LLVMDEBUG::StaleCI is \n";
+    StaleCI->dump();
+    StaleCI->getParent()->getParent()->dump();
     // HasShareds is true if any variables are captured in the outlined region,
     // false otherwise.
     bool HasShareds = StaleCI->arg_size() > 1;
@@ -5234,7 +5239,46 @@ static void emitTargetOutlinedFunction(
   OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
                                       OutlinedFn, OutlinedFnID);
 }
-
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
+    IRBuilderBase &Builder, Function *OutlinedFn, Value *OutlinedFnID,
+    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
+    Value *DeviceID, Value *RTLoc) {
+
+  // BasicBlock *TargetTaskExitBB = splitBB(Builder, /*CreateBranch=*/true,
+  // "target.task.exit");
+  BasicBlock *TargetTaskBodyBB =
+      splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
+  BasicBlock *TargetTaskAllocaBB =
+      splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
+
+  InsertPointTy TargetTaskAllocaIP =
+      InsertPointTy(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
+  InsertPointTy TargetTaskBodyIP =
+      InsertPointTy(TargetTaskBodyBB, TargetTaskBodyBB->begin());
+
+  {
+    // debug prints block
+    llvm::errs() << "Insert block before emitKernelLaunch in emittargettask\n";
+    Builder.GetInsertBlock()->dump();
+    llvm::errs()
+        << "LLVMDEBUG:: module before emitKernelLaunch in emittargettask is \n";
+    Builder.GetInsertBlock()->getParent()->getParent()->dump();
+  }
+  Builder.restoreIP(TargetTaskBodyIP);
+  Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
+                                     EmitTargetCallFallbackCB, Args, DeviceID,
+                                     RTLoc, TargetTaskAllocaIP));
+  {
+    // debug prints block
+    llvm::errs()
+        << "Insert block after emitKernelLaunch in emittargettask is \n";
+    Builder.GetInsertBlock()->dump();
+    llvm::errs()
+        << "LLVMDEBUG:: module after emitKernelLaunch in emittargettask is \n";
+    Builder.GetInsertBlock()->getParent()->getParent()->dump();
+  }
+  return Builder.saveIP();
+}
 static void emitTargetCall(
     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
     OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
@@ -5288,14 +5332,44 @@ static void emitTargetCall(
   //       Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
   //       DeviceID, RTLoc, AllocaIP));
   // else {
-  //     create task
-  //     make task call emitkernellaunch.
-  //     make task call
+  //   codegen_callback = codegen callback to create task logic which should be
+  //   received from openmptollvmirtranslation + emitkernellaunch
+  //   create_task(codegen_callback)
+  //   make task call
   // }
   //
-  Builder.restoreIP(OMPBuilder.emitKernelLaunch(
-      Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
-      DeviceID, RTLoc, AllocaIP));
+  {
+    // Debug block
+    llvm::errs() << "Outlined Target Func is \n";
+    OutlinedFn->dump();
+    llvm::errs() << "CurrentInsertBlock is \n";
+    if (Builder.GetInsertBlock()) {
+      Builder.GetInsertBlock()->dump();
+      llvm::errs() << "Builder.GetInsertBlock = " << Builder.GetInsertBlock()
+                   << "\n";
+    } else
+      llvm::errs() << "CurrentInsertBlock not set\n";
+
+    OpenMPIRBuilder::InsertPointTy IP = Builder.saveIP();
+    if (IP.getBlock() == nullptr) {
+      llvm::errs() << "InsertPoint block is null\n";
+    } else {
+      llvm::errs() << "IP.getBlock() = " << IP.getBlock() << "\n";
+    }
+    llvm::errs() << "AllocaIP = \n";
+    llvm::errs() << "Block:\n";
+    AllocaIP.getBlock()->dump();
+    llvm::errs() << "Point:\n";
+    AllocaIP.getPoint()->dump();
+  }
+  if (NewOMPIRBuilderTargetCodegen) {
+    OMPBuilder.emitTargetTask(Builder, OutlinedFn, OutlinedFnID,
+                              EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc);
+  } else {
+    Builder.restoreIP(OMPBuilder.emitKernelLaunch(
+        Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
+        DeviceID, RTLoc, AllocaIP));
+  }
 }
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::newCreateTarget(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
@@ -5615,6 +5689,9 @@ void OpenMPIRBuilder::emitOffloadingArrays(
     return;
 
   Builder.restoreIP(AllocaIP);
+  llvm::errs() << "LLVMDEBUG::Before emitOffloadingArrays in "
+                  "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
+  Builder.GetInsertBlock()->dump();
   // Detect if we have any capture size requiring runtime evaluation of the
   // size so that a constant array could be eventually used.
   ArrayType *PointerArrayType =

>From 53759cc4873bb5054d0f5d6ce9a12c80b6302cfe Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 20 May 2024 15:38:49 -0500
Subject: [PATCH 05/24] checkpoint commit

---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   2 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 119 +++++++++++++++---
 2 files changed, 101 insertions(+), 20 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 10a85e72ae7dc..70845b543e2fa 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1770,7 +1770,7 @@ class OpenMPIRBuilder {
                                Value *OutlinedFnID,
                                EmitFallbackCallbackTy EmitTargetCallFallbackCB,
                                TargetKernelArgs &Args, Value *DeviceID,
-                               Value *RTLoc);
+                               Value *RTLoc, InsertPointTy AllocaIP);
   /// Emit the arguments to be passed to the runtime library based on the
   /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
   /// ForEndCall, emit map types to be passed for the end of the region instead
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index e02794c255ca0..889caf8e40f8a 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -359,6 +359,41 @@ BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
   return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
 }
 
+// This function creates a fake integer value and a fake use for the integer
+// value. It returns the fake value created. This is useful in modeling the
+// extra arguments to the outlined functions.
+Value *createFakeIntVal(IRBuilderBase &Builder,
+                        OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
+                        std::stack<Instruction *> &ToBeDeleted,
+                        OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
+                        const Twine &Name = "", bool AsPtr = true) {
+  Builder.restoreIP(OuterAllocaIP);
+  Instruction *FakeVal;
+  AllocaInst *FakeValAddr =
+      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
+  ToBeDeleted.push(FakeValAddr);
+
+  if (AsPtr) {
+    FakeVal = FakeValAddr;
+  } else {
+    FakeVal =
+        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
+    ToBeDeleted.push(FakeVal);
+  }
+
+  // Generate a fake use of this value
+  Builder.restoreIP(InnerAllocaIP);
+  Instruction *UseFakeVal;
+  if (AsPtr) {
+    UseFakeVal =
+        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
+  } else {
+    UseFakeVal =
+        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
+  }
+  ToBeDeleted.push(UseFakeVal);
+  return FakeVal;
+}
 // This function creates a fake integer value and a fake use for the integer
 // value. It returns the fake value created. This is useful in modeling the
 // extra arguments to the outlined functions.
@@ -1049,8 +1084,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
       Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
   Builder.restoreIP(Loc.IP);
 
-  llvm::errs() << "LLVMDEBUG::KernelArgs.size() in emitTargetKernel = "
-               << KernelArgs.size() << "\n";
+  LLVM_DEBUG(dbgs() << "KernelArgs.size() in emitTargetKernel = "
+                    << KernelArgs.size() << "\n");
 
   for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
     llvm::Value *Arg =
@@ -1760,9 +1795,13 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
     assert(OutlinedFn.getNumUses() == 1 &&
            "there must be a single user for the outlined function");
     CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
-    llvm::errs() << "LLVMDEBUG::StaleCI is \n";
-    StaleCI->dump();
-    StaleCI->getParent()->getParent()->dump();
+    LLVM_DEBUG(dbgs() << "StaleCI =" << *StaleCI << "\n");
+    LLVM_DEBUG(dbgs() << "StateCI->getParent()->getParent() = "
+                      << *(StaleCI->getParent()->getParent()) << "\n");
+
+    // llvm::errs() << "LLVMDEBUG::StaleCI is \n";
+    // StaleCI->dump();
+    // StaleCI->getParent()->getParent()->dump();
     // HasShareds is true if any variables are captured in the outlined region,
     // false otherwise.
     bool HasShareds = StaleCI->arg_size() > 1;
@@ -5222,6 +5261,14 @@ static Function *createOutlinedFunction(
   return Func;
 }
 
+// define internal i32 @.omp_task_entry..3(i32 noundef %0, ptr noalias noundef
+// %1) #3 {
+static void
+emitProxyTaskFunction(OpenMPIRBuilder::InsertPointTy ProxyFnCallSiteIP) {
+  // Create a function with the following signature
+  LLVMContext &Ctx = ProxyFnCallSiteIP.getBlock()->getContext();
+  Type *ThreadIDTy = Type::getInt32Ty(Ctx);
+}
 static void emitTargetOutlinedFunction(
     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
     TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
@@ -5242,7 +5289,7 @@ static void emitTargetOutlinedFunction(
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
     IRBuilderBase &Builder, Function *OutlinedFn, Value *OutlinedFnID,
     EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
-    Value *DeviceID, Value *RTLoc) {
+    Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP) {
 
   // BasicBlock *TargetTaskExitBB = splitBB(Builder, /*CreateBranch=*/true,
   // "target.task.exit");
@@ -5255,7 +5302,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
       InsertPointTy(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
   InsertPointTy TargetTaskBodyIP =
       InsertPointTy(TargetTaskBodyBB, TargetTaskBodyBB->begin());
-
+#if 0
   {
     // debug prints block
     llvm::errs() << "Insert block before emitKernelLaunch in emittargettask\n";
@@ -5264,19 +5311,51 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
         << "LLVMDEBUG:: module before emitKernelLaunch in emittargettask is \n";
     Builder.GetInsertBlock()->getParent()->getParent()->dump();
   }
+#endif
+  OutlineInfo OI;
+  OI.EntryBB = TargetTaskAllocaBB;
+  OI.OuterAllocaBB = AllocaIP.getBlock();
+
+  // Add the thread ID argument.
+  std::stack<Instruction *> ToBeDeleted;
+  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
+      Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
   Builder.restoreIP(TargetTaskBodyIP);
+
   Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
                                      EmitTargetCallFallbackCB, Args, DeviceID,
                                      RTLoc, TargetTaskAllocaIP));
+  OI.ExitBB = Builder.saveIP().getBlock();
+  // OI.PostOutlineCB = [this,
+  //                     TargetTaskAllocaBB, ToBeDeleted](Function &OutlinedFn)
+  //                     mutable {
+
+  //   assert(OutlinedFn.getNumUses() == 1 &&
+  //          "there must be a single user for the outlined function");
+  //   CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+  //   llvm::errs() << "LLVMDEBUG::StaleCI in postline for targettask\n";
+  //   StaleCI->dump();
+  //   StaleCI->getParent()->getParent()->getParent()->dump();
+
+  //   emitProxyTaskFunction(InsertPointTy(StaleCI->getParent(),
+  //   StaleCI->getIterator()));
+
+  //   // while (!ToBeDeleted.empty()) {
+  //   //   ToBeDeleted.top()->eraseFromParent();
+  //   //   ToBeDeleted.pop();
+  //   // }
+  // };
+  addOutlineInfo(std::move(OI));
+#if 1
   {
     // debug prints block
-    llvm::errs()
-        << "Insert block after emitKernelLaunch in emittargettask is \n";
-    Builder.GetInsertBlock()->dump();
-    llvm::errs()
-        << "LLVMDEBUG:: module after emitKernelLaunch in emittargettask is \n";
-    Builder.GetInsertBlock()->getParent()->getParent()->dump();
+    LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
+                      << *(Builder.GetInsertBlock()) << "\n");
+    LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
+                      << *(Builder.GetInsertBlock()->getParent()->getParent())
+                      << "\n");
   }
+#endif
   return Builder.saveIP();
 }
 static void emitTargetCall(
@@ -5338,6 +5417,7 @@ static void emitTargetCall(
   //   make task call
   // }
   //
+#if 0
   {
     // Debug block
     llvm::errs() << "Outlined Target Func is \n";
@@ -5362,9 +5442,11 @@ static void emitTargetCall(
     llvm::errs() << "Point:\n";
     AllocaIP.getPoint()->dump();
   }
+#endif
   if (NewOMPIRBuilderTargetCodegen) {
     OMPBuilder.emitTargetTask(Builder, OutlinedFn, OutlinedFnID,
-                              EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc);
+                              EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,
+                              AllocaIP);
   } else {
     Builder.restoreIP(OMPBuilder.emitKernelLaunch(
         Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
@@ -5380,12 +5462,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::newCreateTarget(
     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
     SmallVector<DependData> Dependencies) {
   if (!NewOMPIRBuilderTargetCodegen) {
-    llvm::errs() << "Old OpenMPIRBuilder target codegen\n";
+    LLVM_DEBUG(dbgs() << "Old OpenMPIRBuilder target codegen\n");
     return createTarget(Loc, AllocaIP, CodeGenIP, EntryInfo, NumTeams,
                         NumThreads, Args, GenMapInfoCB, CBFunc,
                         ArgAccessorFuncCB);
   }
-  llvm::errs() << "New OpenMPIRBuilder target codegen\n";
+  LLVM_DEBUG(dbgs() << "New OpenMPIRBuilder target codegen\n");
   if (!updateToLocation(Loc))
     return InsertPointTy();
 
@@ -5689,9 +5771,8 @@ void OpenMPIRBuilder::emitOffloadingArrays(
     return;
 
   Builder.restoreIP(AllocaIP);
-  llvm::errs() << "LLVMDEBUG::Before emitOffloadingArrays in "
-                  "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
-  Builder.GetInsertBlock()->dump();
+  LLVM_DEBUG(dbgs() << "Basicblock before emitOffloadingArrays\n"
+                    << *(Builder.GetInsertBlock()) << "\n");
   // Detect if we have any capture size requiring runtime evaluation of the
   // size so that a constant array could be eventually used.
   ArrayType *PointerArrayType =

>From 83cea83f05ff1da0cc80d08f7635e52492c854ed Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 22 May 2024 00:55:12 -0500
Subject: [PATCH 06/24] checkpoint commit

---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   9 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 212 +++++++++++++++---
 llvm/lib/IR/Instruction.cpp                   |   5 +
 3 files changed, 196 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 70845b543e2fa..5c0eedbfc9a45 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -592,7 +592,11 @@ class OpenMPIRBuilder {
   /// (filename, line, column, ...).
   struct LocationDescription {
     LocationDescription(const IRBuilderBase &IRB)
-        : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
+        : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {
+      llvm::errs() << "In LocationDescription(const IRBuilderBase &), "
+                      "IRB.GetInsertBlock() = "
+                   << *IRB.GetInsertBlock() << "\n";
+    }
     LocationDescription(const InsertPointTy &IP) : IP(IP) {}
     LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
         : IP(IP), DL(DL) {}
@@ -1766,8 +1770,7 @@ class OpenMPIRBuilder {
       const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
       EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
       Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
-  InsertPointTy emitTargetTask(IRBuilderBase &Builder, Function *OutlinedFn,
-                               Value *OutlinedFnID,
+  InsertPointTy emitTargetTask(Function *OutlinedFn, Value *OutlinedFnID,
                                EmitFallbackCallbackTy EmitTargetCallFallbackCB,
                                TargetKernelArgs &Args, Value *DeviceID,
                                Value *RTLoc, InsertPointTy AllocaIP);
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 889caf8e40f8a..eeccf9cf64656 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -609,6 +609,10 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
 
 FunctionCallee
 OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
+  LLVM_DEBUG(dbgs() << "getOrCreateRuntimeFunction:Builder.GetInsertBlock() = "
+                    << *Builder.GetInsertBlock() << "\n");
+  LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = " << Builder.GetInsertBlock()
+                    << "\n");
   FunctionType *FnTy = nullptr;
   Function *Fn = nullptr;
 
@@ -655,6 +659,10 @@ OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
     addAttributes(FnID, *Fn);
 
   } else {
+    LLVM_DEBUG(dbgs() << "{else}Builder.GetInsertBlock() = "
+                      << *Builder.GetInsertBlock() << "\n");
+    LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
+                      << Builder.GetInsertBlock() << "\n");
     LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                       << " with type " << *Fn->getFunctionType() << "\n");
   }
@@ -958,6 +966,11 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
 }
 
 Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
+  LLVM_DEBUG(dbgs() << "&Builder = " << &Builder << "\n");
+  LLVM_DEBUG(dbgs() << "getORCreateThreadID:Builder.GetInsertBlock() = "
+                    << *Builder.GetInsertBlock() << "\n");
+  LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = " << Builder.GetInsertBlock()
+                    << "\n");
   return Builder.CreateCall(
       getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
       "omp_global_thread_num");
@@ -1799,9 +1812,6 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
     LLVM_DEBUG(dbgs() << "StateCI->getParent()->getParent() = "
                       << *(StaleCI->getParent()->getParent()) << "\n");
 
-    // llvm::errs() << "LLVMDEBUG::StaleCI is \n";
-    // StaleCI->dump();
-    // StaleCI->getParent()->getParent()->dump();
     // HasShareds is true if any variables are captured in the outlined region,
     // false otherwise.
     bool HasShareds = StaleCI->arg_size() > 1;
@@ -5263,11 +5273,71 @@ static Function *createOutlinedFunction(
 
 // define internal i32 @.omp_task_entry..3(i32 noundef %0, ptr noalias noundef
 // %1) #3 {
-static void
-emitProxyTaskFunction(OpenMPIRBuilder::InsertPointTy ProxyFnCallSiteIP) {
+static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
+                                       IRBuilderBase &Builder,
+                                       CallInst *StaleCI) {
   // Create a function with the following signature
-  LLVMContext &Ctx = ProxyFnCallSiteIP.getBlock()->getContext();
+  // define internal i32 @.omp_task_entry..3(i32 noundef %0, ptr noalias noundef
+  // %1) #3 {
+  Module &M = OMPBuilder.M;
+  Function *CalledFunction = StaleCI->getCalledFunction();
+  OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
+                                    StaleCI->getIterator());
+  LLVMContext &Ctx = StaleCI->getParent()->getContext();
   Type *ThreadIDTy = Type::getInt32Ty(Ctx);
+  Type *TaskPtrTy = OMPBuilder.TaskPtr;
+  Type *TaskTy = OMPBuilder.Task;
+  auto ProxyFnTy =
+      FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
+                        /* isVarArg */ false);
+  auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
+                                  ".omp_target_task_proxy_func",
+                                  Builder.GetInsertBlock()->getModule());
+  auto OldInsertPoint = Builder.saveIP();
+
+  BasicBlock *EntryBB =
+      BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
+  Builder.SetInsertPoint(EntryBB);
+
+  bool HasShareds = StaleCI->arg_size() > 1;
+  // PDB: Temporary assert.
+  assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
+         "StaleCI with shareds should have exactly two arguments.");
+  if (HasShareds) {
+    AllocaInst *ArgStructAlloca =
+        dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+    assert(ArgStructAlloca &&
+           "Unable to find the alloca instruction corresponding to arguments "
+           "for extracted function");
+    StructType *ArgStructType =
+        dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
+    LLVM_DEBUG(dbgs() << "ArgStructType = " << *ArgStructType << "\n");
+
+    AllocaInst *NewArgStructAlloca =
+        Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
+    Value *TaskT = ProxyFn->getArg(1);
+    Value *ThreadId = ProxyFn->getArg(0);
+    LLVM_DEBUG(dbgs() << "TaskT = " << *TaskT << "\n");
+    Value *SharedsSize =
+        Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
+
+    Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
+    LoadInst *LoadShared =
+        Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
+
+    // TODO: Are these alignment values correct?
+    Builder.CreateMemCpy(
+        NewArgStructAlloca,
+        NewArgStructAlloca->getPointerAlignment(M.getDataLayout()), Shareds,
+        LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
+
+    Builder.CreateCall(CalledFunction, {ThreadId, NewArgStructAlloca});
+  }
+  CalledFunction->removeFnAttr(llvm::Attribute::NoInline);
+  CalledFunction->addFnAttr(llvm::Attribute::AlwaysInline);
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldInsertPoint);
+  return ProxyFn;
 }
 static void emitTargetOutlinedFunction(
     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
@@ -5287,10 +5357,12 @@ static void emitTargetOutlinedFunction(
                                       OutlinedFn, OutlinedFnID);
 }
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
-    IRBuilderBase &Builder, Function *OutlinedFn, Value *OutlinedFnID,
+    Function *OutlinedFn, Value *OutlinedFnID,
     EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
     Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP) {
 
+  LLVM_DEBUG(dbgs() << "emitTargetTask:OMPBuilder.Builder = " << &this->Builder
+                    << ", Builder = " << &Builder << "\n");
   // BasicBlock *TargetTaskExitBB = splitBB(Builder, /*CreateBranch=*/true,
   // "target.task.exit");
   BasicBlock *TargetTaskBodyBB =
@@ -5322,29 +5394,111 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
       Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
   Builder.restoreIP(TargetTaskBodyIP);
 
+  // emitKernelLaunch makes the necessary runtime call to offload the kernel.
+  // We then outline all that code into a separate function that is called
+  // by the task wrapper function (aka Proxy task function - see
+  // emitProxyTaskFunction)
   Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
                                      EmitTargetCallFallbackCB, Args, DeviceID,
                                      RTLoc, TargetTaskAllocaIP));
   OI.ExitBB = Builder.saveIP().getBlock();
-  // OI.PostOutlineCB = [this,
-  //                     TargetTaskAllocaBB, ToBeDeleted](Function &OutlinedFn)
-  //                     mutable {
-
-  //   assert(OutlinedFn.getNumUses() == 1 &&
-  //          "there must be a single user for the outlined function");
-  //   CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
-  //   llvm::errs() << "LLVMDEBUG::StaleCI in postline for targettask\n";
-  //   StaleCI->dump();
-  //   StaleCI->getParent()->getParent()->getParent()->dump();
-
-  //   emitProxyTaskFunction(InsertPointTy(StaleCI->getParent(),
-  //   StaleCI->getIterator()));
-
-  //   // while (!ToBeDeleted.empty()) {
-  //   //   ToBeDeleted.top()->eraseFromParent();
-  //   //   ToBeDeleted.pop();
-  //   // }
-  // };
+  OI.PostOutlineCB = [this, ToBeDeleted](Function &OutlinedFn) mutable {
+    assert(OutlinedFn.getNumUses() == 1 &&
+           "there must be a single user for the outlined function");
+    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+    bool HasShareds = StaleCI->arg_size() > 1;
+
+    LLVM_DEBUG(dbgs() << "StaleCI in PostOutlineCB in emitTargetTask = "
+                      << *StaleCI << "\n");
+    LLVM_DEBUG(dbgs() << "Module in PostOutlineCB in emitTargetTask = "
+                      << *(StaleCI->getParent()->getParent()->getParent())
+                      << "\n");
+
+    Function *ProxyFn = emitProxyTaskFunction(*this, Builder, StaleCI);
+    LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
+                      << "\n");
+
+    Builder.SetInsertPoint(StaleCI);
+    uint32_t SrcLocStrSize;
+    Constant *SrcLocStr =
+        getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
+    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+    // Gather the arguments for emitting the runtime call for
+    // @__kmpc_omp_task_alloc
+    Function *TaskAllocFn =
+        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
+
+    // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
+    // call.
+    LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
+                      << *(Builder.GetInsertBlock()) << "\n");
+    LLVM_DEBUG(dbgs() << "Builder.GetInsertPoint() = "
+                      << *(Builder.GetInsertPoint()) << "\n");
+    LLVM_DEBUG(dbgs() << "Builder.GetInsertPoint()->getParent() = "
+                      << Builder.GetInsertPoint()->getParent() << "\n");
+    LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
+                      << Builder.GetInsertBlock() << "\n");
+    LLVM_DEBUG(dbgs() << "In the Callback: OMPBuilder.Builder = "
+                      << &this->Builder << ", Builder = " << &Builder << "\n");
+    LLVM_DEBUG(dbgs() << "&Builder = " << &Builder << "\n");
+    Value *ThreadID = getOrCreateThreadID(Ident);
+
+    // TODO : Task tied or not? See what clang does.
+
+    // Argument - `sizeof_kmp_task_t` (TaskSize)
+    // Tasksize refers to the size in bytes of kmp_task_t data structure
+    // including private vars accessed in task.
+    // TODO: add kmp_task_t_with_privates (privates)
+    Value *TaskSize = Builder.getInt64(
+        divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
+
+    // Argument - `sizeof_shareds` (SharedsSize)
+    // SharedsSize refers to the shareds array size in the kmp_task_t data
+    // structure.
+    Value *SharedsSize = Builder.getInt64(0);
+    if (HasShareds) {
+      AllocaInst *ArgStructAlloca =
+          dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+      assert(ArgStructAlloca &&
+             "Unable to find the alloca instruction corresponding to arguments "
+             "for extracted function");
+      StructType *ArgStructType =
+          dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
+      assert(ArgStructType && "Unable to find struct type corresponding to "
+                              "arguments for extracted function");
+      SharedsSize =
+          Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
+    }
+
+    // Argument - `flags`
+    // Task is tied iff (Flags & 1) == 1.
+    // Task is untied iff (Flags & 1) == 0.
+    // Task is final iff (Flags & 2) == 2.
+    // Task is not final iff (Flags & 2) == 0.
+    // A target task is not final and is untied.
+    Value *Flags = Builder.getInt32(0);
+
+    // Emit the @__kmpc_omp_task_alloc runtime call
+    // The runtime call returns a pointer to an area where the task captured
+    // variables must be copied before the task is run (TaskData)
+    CallInst *TaskData = Builder.CreateCall(
+        TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
+                      /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
+                      /*task_func=*/ProxyFn});
+
+    if (HasShareds) {
+      Value *Shareds = StaleCI->getArgOperand(1);
+      Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
+      Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
+      Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
+                           SharedsSize);
+    }
+
+    // while (!ToBeDeleted.empty()) {
+    //   ToBeDeleted.top()->eraseFromParent();
+    //   ToBeDeleted.pop();
+    // }
+  };
   addOutlineInfo(std::move(OI));
 #if 1
   {
@@ -5374,6 +5528,8 @@ static void emitTargetCall(
   OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
                                   /*IsNonContiguous=*/true);
 
+  LLVM_DEBUG(dbgs() << "OMPBuilder.Builder = " << &OMPBuilder.Builder
+                    << ", Builder = " << &Builder << "\n");
   OpenMPIRBuilder::TargetDataRTArgs RTArgs;
   OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
                                           !MapInfo.Names.empty());
@@ -5381,6 +5537,8 @@ static void emitTargetCall(
   //  emitKernelLaunch
   auto &&EmitTargetCallFallbackCB =
       [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
+    LLVM_DEBUG(dbgs() << "EmitTargetCallFallbackCB::Builder = " << &Builder
+                      << "\n");
     Builder.restoreIP(IP);
     Builder.CreateCall(OutlinedFn, Args);
     return Builder.saveIP();
@@ -5444,7 +5602,7 @@ static void emitTargetCall(
   }
 #endif
   if (NewOMPIRBuilderTargetCodegen) {
-    OMPBuilder.emitTargetTask(Builder, OutlinedFn, OutlinedFnID,
+    OMPBuilder.emitTargetTask(OutlinedFn, OutlinedFnID,
                               EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,
                               AllocaIP);
   } else {
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 29272e627a1d1..22f5b5a41fc3f 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -128,6 +128,11 @@ void Instruction::insertAfter(Instruction *InsertPos) {
 BasicBlock::iterator Instruction::insertInto(BasicBlock *ParentBB,
                                              BasicBlock::iterator It) {
   assert(getParent() == nullptr && "Expected detached instruction");
+  if (!(It == ParentBB->end() || It->getParent() == ParentBB)) {
+    llvm::errs() << "ParentBB = " << *ParentBB << "\n";
+    llvm::errs() << "It = " << *It << "\n";
+    llvm::errs() << "It->getParent() = " << *It->getParent() << "\n";
+  }
   assert((It == ParentBB->end() || It->getParent() == ParentBB) &&
          "It not in ParentBB");
   insertBefore(*ParentBB, It);

>From d5c2449079d292de324d9a72ddd0c94d38a1b1f8 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 24 May 2024 01:30:58 -0500
Subject: [PATCH 07/24] Simple test working. checkpoint commit. next steps
 clean up code, unittests(?) and lit tests - basically more testing before PR

---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  22 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 292 ++++++++++--------
 llvm/lib/IR/BasicBlock.cpp                    |   6 +-
 llvm/lib/IR/Function.cpp                      |  16 +-
 llvm/lib/IR/Module.cpp                        |   4 +-
 mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp    |   3 +-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |   2 -
 7 files changed, 191 insertions(+), 154 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5c0eedbfc9a45..8a67cd4b8d9f0 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -592,11 +592,7 @@ class OpenMPIRBuilder {
   /// (filename, line, column, ...).
   struct LocationDescription {
     LocationDescription(const IRBuilderBase &IRB)
-        : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {
-      llvm::errs() << "In LocationDescription(const IRBuilderBase &), "
-                      "IRB.GetInsertBlock() = "
-                   << *IRB.GetInsertBlock() << "\n";
-    }
+        : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
     LocationDescription(const InsertPointTy &IP) : IP(IP) {}
     LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
         : IP(IP), DL(DL) {}
@@ -1522,12 +1518,7 @@ class OpenMPIRBuilder {
   std::forward_list<CanonicalLoopInfo> LoopInfos;
 
   /// Add a new region that will be outlined later.
-  void addOutlineInfo(OutlineInfo &&OI) {
-    llvm::errs() << "Adding outline info\n";
-    llvm::errs() << "OI.EntryBB = ";
-    OI.EntryBB->dump();
-    OutlineInfos.emplace_back(OI);
-  }
+  void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
 
   /// An ordered map of auto-generated variables to their unique names.
   /// It stores variables with the following names: 1) ".gomp_critical_user_" +
@@ -1770,10 +1761,11 @@ class OpenMPIRBuilder {
       const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
       EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
       Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
-  InsertPointTy emitTargetTask(Function *OutlinedFn, Value *OutlinedFnID,
-                               EmitFallbackCallbackTy EmitTargetCallFallbackCB,
-                               TargetKernelArgs &Args, Value *DeviceID,
-                               Value *RTLoc, InsertPointTy AllocaIP);
+  InsertPointTy emitTargetTask(
+      Function *OutlinedFn, Value *OutlinedFnID,
+      EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
+      Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP,
+      SmallVector<OpenMPIRBuilder::DependData> &Dependencies, bool HasNoWait);
   /// Emit the arguments to be passed to the runtime library based on the
   /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
   /// ForEndCall, emit map types to be passed for the end of the region instead
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index eeccf9cf64656..be0717898ff25 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -394,41 +394,42 @@ Value *createFakeIntVal(IRBuilderBase &Builder,
   ToBeDeleted.push(UseFakeVal);
   return FakeVal;
 }
-// This function creates a fake integer value and a fake use for the integer
-// value. It returns the fake value created. This is useful in modeling the
-// extra arguments to the outlined functions.
-Value *createFakeIntVal(IRBuilder<> &Builder,
-                        OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
-                        std::stack<Instruction *> &ToBeDeleted,
-                        OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
-                        const Twine &Name = "", bool AsPtr = true) {
-  Builder.restoreIP(OuterAllocaIP);
-  Instruction *FakeVal;
-  AllocaInst *FakeValAddr =
-      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
-  ToBeDeleted.push(FakeValAddr);
-
-  if (AsPtr) {
-    FakeVal = FakeValAddr;
-  } else {
-    FakeVal =
-        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
-    ToBeDeleted.push(FakeVal);
-  }
-
-  // Generate a fake use of this value
-  Builder.restoreIP(InnerAllocaIP);
-  Instruction *UseFakeVal;
-  if (AsPtr) {
-    UseFakeVal =
-        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
-  } else {
-    UseFakeVal =
-        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
-  }
-  ToBeDeleted.push(UseFakeVal);
-  return FakeVal;
-}
+// // This function creates a fake integer value and a fake use for the integer
+// // value. It returns the fake value created. This is useful in modeling the
+// // extra arguments to the outlined functions.
+// Value *createFakeIntVal(IRBuilder<> &Builder,
+//                         OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
+//                         std::stack<Instruction *> &ToBeDeleted,
+//                         OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
+//                         const Twine &Name = "", bool AsPtr = true) {
+//   Builder.restoreIP(OuterAllocaIP);
+//   Instruction *FakeVal;
+//   AllocaInst *FakeValAddr =
+//       Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
+//   ToBeDeleted.push(FakeValAddr);
+
+//   if (AsPtr) {
+//     FakeVal = FakeValAddr;
+//   } else {
+//     FakeVal =
+//         Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
+//     ToBeDeleted.push(FakeVal);
+//   }
+
+//   // Generate a fake use of this value
+//   Builder.restoreIP(InnerAllocaIP);
+//   Instruction *UseFakeVal;
+//   if (AsPtr) {
+//     UseFakeVal =
+//         Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
+//   } else {
+//     UseFakeVal =
+//         cast<BinaryOperator>(Builder.CreateAdd(FakeVal,
+//         Builder.getInt32(10)));
+//   }
+//   ToBeDeleted.push(UseFakeVal);
+//   return FakeVal;
+// }
 
 //===----------------------------------------------------------------------===//
 // OpenMPIRBuilderConfig
@@ -609,10 +610,6 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
 
 FunctionCallee
 OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
-  LLVM_DEBUG(dbgs() << "getOrCreateRuntimeFunction:Builder.GetInsertBlock() = "
-                    << *Builder.GetInsertBlock() << "\n");
-  LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = " << Builder.GetInsertBlock()
-                    << "\n");
   FunctionType *FnTy = nullptr;
   Function *Fn = nullptr;
 
@@ -659,10 +656,6 @@ OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
     addAttributes(FnID, *Fn);
 
   } else {
-    LLVM_DEBUG(dbgs() << "{else}Builder.GetInsertBlock() = "
-                      << *Builder.GetInsertBlock() << "\n");
-    LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
-                      << Builder.GetInsertBlock() << "\n");
     LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                       << " with type " << *Fn->getFunctionType() << "\n");
   }
@@ -966,11 +959,6 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
 }
 
 Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
-  LLVM_DEBUG(dbgs() << "&Builder = " << &Builder << "\n");
-  LLVM_DEBUG(dbgs() << "getORCreateThreadID:Builder.GetInsertBlock() = "
-                    << *Builder.GetInsertBlock() << "\n");
-  LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = " << Builder.GetInsertBlock()
-                    << "\n");
   return Builder.CreateCall(
       getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
       "omp_global_thread_num");
@@ -1097,9 +1085,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
       Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
   Builder.restoreIP(Loc.IP);
 
-  LLVM_DEBUG(dbgs() << "KernelArgs.size() in emitTargetKernel = "
-                    << KernelArgs.size() << "\n");
-
   for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
     llvm::Value *Arg =
         Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
@@ -1753,6 +1738,54 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
     return;
   emitTaskyieldImpl(Loc);
 }
+static Value *
+emitDepArray(OpenMPIRBuilder &OMPBuilder,
+             SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
+  IRBuilderBase &Builder = OMPBuilder.Builder;
+  Type *DependInfo = OMPBuilder.DependInfo;
+  Module &M = OMPBuilder.M;
+
+  Value *DepArray = nullptr;
+  if (Dependencies.size()) {
+    OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+    Builder.SetInsertPoint(
+        &OldIP.getBlock()->getParent()->getEntryBlock().back());
+
+    Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
+    DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
+
+    unsigned P = 0;
+    for (const OpenMPIRBuilder::DependData &Dep : Dependencies) {
+      Value *Base =
+          Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
+      // Store the dependence variable's address, converted to an i64.
+      Value *Addr = Builder.CreateStructGEP(
+          DependInfo, Base,
+          static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
+      Value *DepValPtr =
+          Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
+      Builder.CreateStore(DepValPtr, Addr);
+      // Store the DataLayout store size of the variable's type as an i64.
+      Value *Size = Builder.CreateStructGEP(
+          DependInfo, Base,
+          static_cast<unsigned int>(RTLDependInfoFields::Len));
+      Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
+                              Dep.DepValueType)),
+                          Size);
+      // Store the dependency kind as an i8 in the flags field.
+      Value *Flags = Builder.CreateStructGEP(
+          DependInfo, Base,
+          static_cast<unsigned int>(RTLDependInfoFields::Flags));
+      Builder.CreateStore(
+          ConstantInt::get(Builder.getInt8Ty(),
+                           static_cast<unsigned int>(Dep.DepKind)),
+          Flags);
+      ++P;
+    }
+    Builder.restoreIP(OldIP);
+  }
+  return DepArray;
+}
 
 OpenMPIRBuilder::InsertPointTy
 OpenMPIRBuilder::createTask(const LocationDescription &Loc,
@@ -1808,9 +1841,6 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
     assert(OutlinedFn.getNumUses() == 1 &&
            "there must be a single user for the outlined function");
     CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
-    LLVM_DEBUG(dbgs() << "StaleCI =" << *StaleCI << "\n");
-    LLVM_DEBUG(dbgs() << "StateCI->getParent()->getParent() = "
-                      << *(StaleCI->getParent()->getParent()) << "\n");
 
     // HasShareds is true if any variables are captured in the outlined region,
     // false otherwise.
@@ -5293,7 +5323,7 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
   auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
                                   ".omp_target_task_proxy_func",
                                   Builder.GetInsertBlock()->getModule());
-  auto OldInsertPoint = Builder.saveIP();
+  //  auto OldInsertPoint = Builder.saveIP();
 
   BasicBlock *EntryBB =
       BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
@@ -5328,15 +5358,17 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
     // TODO: Are these alignment values correct?
     Builder.CreateMemCpy(
         NewArgStructAlloca,
-        NewArgStructAlloca->getPointerAlignment(M.getDataLayout()), Shareds,
+        NewArgStructAlloca->getPointerAlignment(M.getDataLayout()), LoadShared,
         LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
 
     Builder.CreateCall(CalledFunction, {ThreadId, NewArgStructAlloca});
   }
-  CalledFunction->removeFnAttr(llvm::Attribute::NoInline);
-  CalledFunction->addFnAttr(llvm::Attribute::AlwaysInline);
+  // CalledFunction->removeFnAttr(llvm::Attribute::NoInline);
+  // CalledFunction->addFnAttr(llvm::Attribute::AlwaysInline);
+  ProxyFn->getArg(0)->setName("thread.id");
+  ProxyFn->getArg(1)->setName("task");
   Builder.CreateRetVoid();
-  Builder.restoreIP(OldInsertPoint);
+  //  Builder.restoreIP(OldInsertPoint);
   return ProxyFn;
 }
 static void emitTargetOutlinedFunction(
@@ -5359,12 +5391,13 @@ static void emitTargetOutlinedFunction(
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
     Function *OutlinedFn, Value *OutlinedFnID,
     EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
-    Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP) {
+    Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP,
+    SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
+    bool HasNoWait) {
 
   LLVM_DEBUG(dbgs() << "emitTargetTask:OMPBuilder.Builder = " << &this->Builder
                     << ", Builder = " << &Builder << "\n");
-  // BasicBlock *TargetTaskExitBB = splitBB(Builder, /*CreateBranch=*/true,
-  // "target.task.exit");
+
   BasicBlock *TargetTaskBodyBB =
       splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
   BasicBlock *TargetTaskAllocaBB =
@@ -5374,16 +5407,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
       InsertPointTy(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
   InsertPointTy TargetTaskBodyIP =
       InsertPointTy(TargetTaskBodyBB, TargetTaskBodyBB->begin());
-#if 0
-  {
-    // debug prints block
-    llvm::errs() << "Insert block before emitKernelLaunch in emittargettask\n";
-    Builder.GetInsertBlock()->dump();
-    llvm::errs()
-        << "LLVMDEBUG:: module before emitKernelLaunch in emittargettask is \n";
-    Builder.GetInsertBlock()->getParent()->getParent()->dump();
-  }
-#endif
+
   OutlineInfo OI;
   OI.EntryBB = TargetTaskAllocaBB;
   OI.OuterAllocaBB = AllocaIP.getBlock();
@@ -5392,6 +5416,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
   std::stack<Instruction *> ToBeDeleted;
   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
       Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
+
   Builder.restoreIP(TargetTaskBodyIP);
 
   // emitKernelLaunch makes the necessary runtime call to offload the kernel.
@@ -5402,9 +5427,11 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
                                      EmitTargetCallFallbackCB, Args, DeviceID,
                                      RTLoc, TargetTaskAllocaIP));
   OI.ExitBB = Builder.saveIP().getBlock();
-  OI.PostOutlineCB = [this, ToBeDeleted](Function &OutlinedFn) mutable {
+  OI.PostOutlineCB = [this, ToBeDeleted, Dependencies,
+                      HasNoWait](Function &OutlinedFn) mutable {
     assert(OutlinedFn.getNumUses() == 1 &&
            "there must be a single user for the outlined function");
+
     CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
     bool HasShareds = StaleCI->arg_size() > 1;
 
@@ -5419,32 +5446,21 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
                       << "\n");
 
     Builder.SetInsertPoint(StaleCI);
+
+    // Gather the arguments for the @__kmpc_omp_task_alloc runtime call.
     uint32_t SrcLocStrSize;
     Constant *SrcLocStr =
         getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
     Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
-    // Gather the arguments for emitting the runtime call for
+
     // @__kmpc_omp_task_alloc
     Function *TaskAllocFn =
         getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
 
     // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
     // call.
-    LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
-                      << *(Builder.GetInsertBlock()) << "\n");
-    LLVM_DEBUG(dbgs() << "Builder.GetInsertPoint() = "
-                      << *(Builder.GetInsertPoint()) << "\n");
-    LLVM_DEBUG(dbgs() << "Builder.GetInsertPoint()->getParent() = "
-                      << Builder.GetInsertPoint()->getParent() << "\n");
-    LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
-                      << Builder.GetInsertBlock() << "\n");
-    LLVM_DEBUG(dbgs() << "In the Callback: OMPBuilder.Builder = "
-                      << &this->Builder << ", Builder = " << &Builder << "\n");
-    LLVM_DEBUG(dbgs() << "&Builder = " << &Builder << "\n");
     Value *ThreadID = getOrCreateThreadID(Ident);
 
-    // TODO : Task tied or not? See what clang does.
-
     // Argument - `sizeof_kmp_task_t` (TaskSize)
     // Tasksize refers to the size in bytes of kmp_task_t data structure
     // including private vars accessed in task.
@@ -5493,23 +5509,65 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
       Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
                            SharedsSize);
     }
+    if (Dependencies.size()) {
+      Value *DepArray = emitDepArray(*this, Dependencies);
+      Function *TaskWaitFn =
+          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
+      Builder.CreateCall(
+          TaskWaitFn,
+          {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
+           ConstantInt::get(Builder.getInt32Ty(), 0),
+           ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
+    }
 
-    // while (!ToBeDeleted.empty()) {
-    //   ToBeDeleted.top()->eraseFromParent();
-    //   ToBeDeleted.pop();
+    // ---------------------------------------------------------------
+    // V5.2 13.8 target construct
+    // If the nowait clause is present, execution of the target task
+    // may be deferred. If the nowait clause is not present, the target task is
+    // an included task.
+    // ---------------------------------------------------------------
+    // The above means that the lack of a nowait on the target construct
+    // translates to '#pragma omp task if(0)'
+    if (!HasNoWait) {
+      // Included task.
+      Function *TaskBeginFn =
+          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
+      Function *TaskCompleteFn =
+          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
+      Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
+      CallInst *CI = nullptr;
+      if (HasShareds)
+        CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
+      else
+        CI = Builder.CreateCall(ProxyFn, {ThreadID});
+      CI->setDebugLoc(StaleCI->getDebugLoc());
+      Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
+    } else {
+      // Emit the @__kmpc_omp_task runtime call to spawn the task
+      Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
+      Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
+    }
+
+    StaleCI->eraseFromParent();
+    // Builder.SetInsertPoint(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
+    // if (HasShareds) {
+    //   LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
+    //   OutlinedFn.getArg(1)->replaceUsesWithIf(
+    //       Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
     // }
+
+    while (!ToBeDeleted.empty()) {
+      ToBeDeleted.top()->eraseFromParent();
+      ToBeDeleted.pop();
+    }
   };
   addOutlineInfo(std::move(OI));
-#if 1
-  {
-    // debug prints block
-    LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
-                      << *(Builder.GetInsertBlock()) << "\n");
-    LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
-                      << *(Builder.GetInsertBlock()->getParent()->getParent())
-                      << "\n");
-  }
-#endif
+
+  LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
+                    << *(Builder.GetInsertBlock()) << "\n");
+  LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
+                    << *(Builder.GetInsertBlock()->getParent()->getParent())
+                    << "\n");
   return Builder.saveIP();
 }
 static void emitTargetCall(
@@ -5518,7 +5576,7 @@ static void emitTargetCall(
     Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads,
     SmallVectorImpl<Value *> &Args,
     OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
-    SmallVector<llvm::OpenMPIRBuilder::DependData> dependencies = {}) {
+    SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {}) {
 
   OpenMPIRBuilder::TargetDataInfo Info(
       /*RequiresDevicePointerInfo=*/false,
@@ -5559,6 +5617,8 @@ static void emitTargetCall(
   Value *DynCGGroupMem = Builder.getInt32(0);
 
   bool HasNoWait = false;
+  bool HasDependencies = Dependencies.size() > 0;
+  bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
 
   OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
                                           NumTeamsVal, NumThreadsVal,
@@ -5575,36 +5635,10 @@ static void emitTargetCall(
   //   make task call
   // }
   //
-#if 0
-  {
-    // Debug block
-    llvm::errs() << "Outlined Target Func is \n";
-    OutlinedFn->dump();
-    llvm::errs() << "CurrentInsertBlock is \n";
-    if (Builder.GetInsertBlock()) {
-      Builder.GetInsertBlock()->dump();
-      llvm::errs() << "Builder.GetInsertBlock = " << Builder.GetInsertBlock()
-                   << "\n";
-    } else
-      llvm::errs() << "CurrentInsertBlock not set\n";
-
-    OpenMPIRBuilder::InsertPointTy IP = Builder.saveIP();
-    if (IP.getBlock() == nullptr) {
-      llvm::errs() << "InsertPoint block is null\n";
-    } else {
-      llvm::errs() << "IP.getBlock() = " << IP.getBlock() << "\n";
-    }
-    llvm::errs() << "AllocaIP = \n";
-    llvm::errs() << "Block:\n";
-    AllocaIP.getBlock()->dump();
-    llvm::errs() << "Point:\n";
-    AllocaIP.getPoint()->dump();
-  }
-#endif
-  if (NewOMPIRBuilderTargetCodegen) {
+  if (NewOMPIRBuilderTargetCodegen && RequiresOuterTargetTask) {
     OMPBuilder.emitTargetTask(OutlinedFn, OutlinedFnID,
                               EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,
-                              AllocaIP);
+                              AllocaIP, Dependencies, HasNoWait);
   } else {
     Builder.restoreIP(OMPBuilder.emitKernelLaunch(
         Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 29f2cbf611fa3..205065aef6488 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -445,8 +445,12 @@ BasicBlock::const_iterator BasicBlock::getFirstNonPHIOrDbgOrAlloca() const {
 }
 
 void BasicBlock::dropAllReferences() {
-  for (Instruction &I : *this)
+  // bool debug_on = (this->getName() == "target.task.alloca");
+  for (Instruction &I : *this) {
+    LLVM_DEBUG(dbgs() << "Dropping all references in I = " << I << "\n");
     I.dropAllReferences();
+    LLVM_DEBUG(dbgs() << "After Dropping all references in I = " << I << "\n");
+  }
 }
 
 const BasicBlock *BasicBlock::getSinglePredecessor() const {
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 13fa1afeaaff2..bb68755bea733 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -64,6 +64,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ModRef.h"
 #include <cassert>
@@ -71,10 +72,10 @@
 #include <cstdint>
 #include <cstring>
 #include <string>
+#define DEBUG_TYPE "pranav"
 
 using namespace llvm;
 using ProfileCount = Function::ProfileCount;
-
 // Explicit instantiations of SymbolTableListTraits since some of the methods
 // are not in the public header file...
 template class llvm::SymbolTableListTraits<BasicBlock>;
@@ -550,10 +551,15 @@ void Function::stealArgumentListFrom(Function &Src) {
 
 void Function::deleteBodyImpl(bool ShouldDrop) {
   setIsMaterializable(false);
-
-  for (BasicBlock &BB : *this)
+  bool OldDebugFlag = DebugFlag;
+  if (this->getName() == "_QQmain..omp_par.1") {
+    DebugFlag = true;
+  }
+  for (BasicBlock &BB : *this) {
+    LLVM_DEBUG(dbgs() << "Dropping all references in " << BB << "\n");
     BB.dropAllReferences();
-
+    LLVM_DEBUG(dbgs() << "After Dropping all references in " << BB << "\n");
+  }
   // Delete all basic blocks. They are now unused, except possibly by
   // blockaddresses, but BasicBlock's destructor takes care of those.
   while (!BasicBlocks.empty())
@@ -573,7 +579,7 @@ void Function::deleteBodyImpl(bool ShouldDrop) {
     }
     setValueSubclassData(getSubclassDataFromValue() & ~0xe);
   }
-
+  DebugFlag = OldDebugFlag;
   // Metadata is stored in a side-table.
   clearMetadata();
 }
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index f97dd18c736c5..4d986ded06f11 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -539,8 +539,10 @@ std::string Module::getUniqueIntrinsicName(StringRef BaseName, Intrinsic::ID Id,
 // has "dropped all references", except operator delete.
 //
 void Module::dropAllReferences() {
-  for (Function &F : *this)
+  for (Function &F : *this) {
+    // llvm::errs() << "Dropping all references in " << F.getName() << "\n";
     F.dropAllReferences();
+  }
 
   for (GlobalVariable &GV : globals())
     GV.dropAllReferences();
diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
index 4558893779534..c0c03df7cbc5d 100644
--- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
@@ -18,6 +18,7 @@
 #include "mlir/Tools/mlir-translate/Translation.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
 
 using namespace mlir;
 
@@ -30,7 +31,7 @@ void registerToLLVMIRTranslation() {
         auto llvmModule = translateModuleToLLVMIR(op, llvmContext);
         if (!llvmModule)
           return failure();
-
+        llvm::verifyModule(*llvmModule);
         llvmModule->print(output, nullptr);
         return success();
       },
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 2fd3aef44ebd5..022ea3af7f58a 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -731,7 +731,6 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
   if (!taskOp.getDependVars().empty() && taskOp.getDepends())
     buildDependData(taskOp.getDepends(), taskOp.getDependVars(),
                     moduleTranslation, dds);
-  llvm::errs() << "# Dependencies in task op = " << dds.size() << "\n";
 
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
       findAllocaInsertPoint(builder, moduleTranslation);
@@ -3097,7 +3096,6 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   if (!targetOp.getDependVars().empty() && targetOp.getDepends())
     buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
                     moduleTranslation, dds);
-  llvm::errs() << "# Dependencies in target op = " << dds.size() << "\n";
 
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->newCreateTarget(
       ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,

>From e32335698794a86fdf904739d01715445af81000 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Tue, 28 May 2024 15:31:06 -0500
Subject: [PATCH 08/24] clean up, clean up, everybody clean up

---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp     | 23 -----------------------
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp |  3 ---
 llvm/lib/IR/BasicBlock.cpp                |  6 +-----
 llvm/lib/IR/Function.cpp                  | 13 +++----------
 llvm/lib/IR/Module.cpp                    |  4 +---
 5 files changed, 5 insertions(+), 44 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index f56c878b45df8..3dfc6bccc8c05 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1320,18 +1320,10 @@ llvm::Function *CGOpenMPRuntime::emitTaskOutlinedFunction(
     HasCancel = TD->hasCancel();
 
   CodeGenFunction CGF(CGM, true);
-  // llvm::errs() << "LLVMDEBUG::Before CGInfo\n";
-  // CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
   CGOpenMPTaskOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen,
                                         InnermostKind, HasCancel, Action);
   CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
-  // llvm::errs() << "LLVMDEBUG::Before GenerateCapturedStmt\n";
-  // CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
   llvm::Function *Res = CGF.GenerateCapturedStmtFunction(*CS);
-  llvm::errs() << "LLVMDEBUG::After GenerateCapturedStmt\n";
-  llvm::errs() << "LLVMDEBUG::CapturedStmt is \n";
-  CS->dump();
-  CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
   if (!Tied)
     NumberOfParts = Action.getNumberOfParts();
   return Res;
@@ -3715,15 +3707,6 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
       KmpTaskTWithPrivatesQTy, KmpTaskTQTy, SharedsPtrTy, TaskFunction,
       TaskPrivatesMap);
 
-  llvm::errs() << "LLVMDEBUG::Proxy task function is \n";
-  TaskEntry->dump();
-  llvm::errs() << "LLVMDEBUG::CGF.Builder.GetInsertBlock() after emitting "
-                  "proxy task function is \n";
-  CGF.Builder.GetInsertBlock()->dump();
-  llvm::errs() << "LLVMDEBUG::SharedsTy is \n";
-  CharUnits cu = C.getTypeSizeInChars(SharedsTy);
-  llvm::errs() << "LLVMDEBUG::sizeof(SharedsTy) = \n";
-  llvm::errs() << cu.getQuantity() << "\n";
   // build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
   // kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
   // kmp_routine_entry_t *task_entry);
@@ -9566,15 +9549,9 @@ static void emitTargetCallKernelLaunch(
   emitOffloadingArrays(CGF, CombinedInfo, Info, OMPBuilder);
   bool EmitDebug = CGF.CGM.getCodeGenOpts().getDebugInfo() !=
                    llvm::codegenoptions::NoDebugInfo;
-  llvm::errs() << "LLVMDEBUG::After emitOffloadingArrays in "
-                  "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
-  OMPBuilder.Builder.GetInsertBlock()->dump();
   OMPBuilder.emitOffloadingArraysArgument(CGF.Builder, Info.RTArgs, Info,
                                           EmitDebug,
                                           /*ForEndCall=*/false);
-  llvm::errs() << "LLVMDEBUG::After emitOffloadingArraysArgument in "
-                  "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
-  OMPBuilder.Builder.GetInsertBlock()->dump();
 
   InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
   InputInfo.BasePointersArray = Address(Info.RTArgs.BasePointersArray,
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index be0717898ff25..565e13c2ad8c9 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5395,9 +5395,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
     SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
     bool HasNoWait) {
 
-  LLVM_DEBUG(dbgs() << "emitTargetTask:OMPBuilder.Builder = " << &this->Builder
-                    << ", Builder = " << &Builder << "\n");
-
   BasicBlock *TargetTaskBodyBB =
       splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
   BasicBlock *TargetTaskAllocaBB =
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 205065aef6488..29f2cbf611fa3 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -445,12 +445,8 @@ BasicBlock::const_iterator BasicBlock::getFirstNonPHIOrDbgOrAlloca() const {
 }
 
 void BasicBlock::dropAllReferences() {
-  // bool debug_on = (this->getName() == "target.task.alloca");
-  for (Instruction &I : *this) {
-    LLVM_DEBUG(dbgs() << "Dropping all references in I = " << I << "\n");
+  for (Instruction &I : *this)
     I.dropAllReferences();
-    LLVM_DEBUG(dbgs() << "After Dropping all references in I = " << I << "\n");
-  }
 }
 
 const BasicBlock *BasicBlock::getSinglePredecessor() const {
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index bb68755bea733..15259b46afe38 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -72,7 +72,6 @@
 #include <cstdint>
 #include <cstring>
 #include <string>
-#define DEBUG_TYPE "pranav"
 
 using namespace llvm;
 using ProfileCount = Function::ProfileCount;
@@ -551,15 +550,10 @@ void Function::stealArgumentListFrom(Function &Src) {
 
 void Function::deleteBodyImpl(bool ShouldDrop) {
   setIsMaterializable(false);
-  bool OldDebugFlag = DebugFlag;
-  if (this->getName() == "_QQmain..omp_par.1") {
-    DebugFlag = true;
-  }
-  for (BasicBlock &BB : *this) {
-    LLVM_DEBUG(dbgs() << "Dropping all references in " << BB << "\n");
+
+  for (BasicBlock &BB : *this)
     BB.dropAllReferences();
-    LLVM_DEBUG(dbgs() << "After Dropping all references in " << BB << "\n");
-  }
+
   // Delete all basic blocks. They are now unused, except possibly by
   // blockaddresses, but BasicBlock's destructor takes care of those.
   while (!BasicBlocks.empty())
@@ -579,7 +573,6 @@ void Function::deleteBodyImpl(bool ShouldDrop) {
     }
     setValueSubclassData(getSubclassDataFromValue() & ~0xe);
   }
-  DebugFlag = OldDebugFlag;
   // Metadata is stored in a side-table.
   clearMetadata();
 }
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 4d986ded06f11..f97dd18c736c5 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -539,10 +539,8 @@ std::string Module::getUniqueIntrinsicName(StringRef BaseName, Intrinsic::ID Id,
 // has "dropped all references", except operator delete.
 //
 void Module::dropAllReferences() {
-  for (Function &F : *this) {
-    // llvm::errs() << "Dropping all references in " << F.getName() << "\n";
+  for (Function &F : *this)
     F.dropAllReferences();
-  }
 
   for (GlobalVariable &GV : globals())
     GV.dropAllReferences();

>From 264dfa6be0544f325b8c480f0d521f12a3fed6fd Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 29 May 2024 15:43:49 -0500
Subject: [PATCH 09/24] Add an MLIR lit test

---
 mlir/test/Target/LLVMIR/omptarget-depend.mlir | 140 ++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-depend.mlir

diff --git a/mlir/test/Target/LLVMIR/omptarget-depend.mlir b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
new file mode 100644
index 0000000000000..c386342005e5e
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
@@ -0,0 +1,140 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+  llvm.func @_QQmain() attributes {fir.bindc_name = "main"} {
+    %0 = llvm.mlir.constant(39 : index) : i64
+    %1 = llvm.mlir.constant(0 : index) : i64
+    %2 = llvm.mlir.constant(1 : index) : i64
+    %3 = llvm.mlir.constant(40 : index) : i64
+    %4 = llvm.mlir.addressof @_QFEa : !llvm.ptr
+    %5 = llvm.mlir.addressof @_QFEb : !llvm.ptr
+    %6 = llvm.mlir.constant(1 : i64) : i64
+    %7 = llvm.alloca %6 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+    %8 = llvm.mlir.addressof @_QFEn : !llvm.ptr
+    omp.task {
+      %14 = llvm.mlir.constant(1 : i64) : i64
+      %15 = llvm.alloca %14 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr
+      %16 = llvm.load %8 : !llvm.ptr -> i32
+      %17 = llvm.sext %16 : i32 to i64
+      %18 = llvm.trunc %2 : i64 to i32
+      llvm.br ^bb1(%18, %17 : i32, i64)
+    ^bb1(%19: i32, %20: i64):  // 2 preds: ^bb0, ^bb2
+      %21 = llvm.icmp "sgt" %20, %1 : i64
+      llvm.cond_br %21, ^bb2, ^bb3
+    ^bb2:  // pred: ^bb1
+      llvm.store %19, %15 : i32, !llvm.ptr
+      %22 = llvm.load %15 : !llvm.ptr -> i32
+      %23 = llvm.sext %22 : i32 to i64
+      %24 = llvm.mlir.constant(1 : i64) : i64
+      %25 = llvm.mlir.constant(0 : i64) : i64
+      %26 = llvm.sub %23, %24 overflow<nsw> : i64
+      %27 = llvm.mul %26, %24 overflow<nsw> : i64
+      %28 = llvm.mul %27, %24 overflow<nsw> : i64
+      %29 = llvm.add %28, %25 overflow<nsw> : i64
+      %30 = llvm.mul %24, %3 overflow<nsw> : i64
+      %31 = llvm.getelementptr %4[%29] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+      llvm.store %22, %31 : i32, !llvm.ptr
+      %32 = llvm.load %15 : !llvm.ptr -> i32
+      %33 = llvm.add %32, %18 : i32
+      %34 = llvm.sub %20, %2 : i64
+      llvm.br ^bb1(%33, %34 : i32, i64)
+    ^bb3:  // pred: ^bb1
+      llvm.store %19, %15 : i32, !llvm.ptr
+      omp.terminator
+    }
+    %9 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%0 : i64) extent(%3 : i64) stride(%2 : i64) start_idx(%2 : i64)
+    %10 = omp.map.info var_ptr(%4 : !llvm.ptr, !llvm.array<40 x i32>) map_clauses(to) capture(ByRef) bounds(%9) -> !llvm.ptr {name = "a"}
+    %11 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.array<40 x i32>) map_clauses(from) capture(ByRef) bounds(%9) -> !llvm.ptr {name = "b"}
+    %12 = omp.map.info var_ptr(%7 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
+    %13 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "n"}
+    omp.target map_entries(%10 -> %arg0, %11 -> %arg1, %12 -> %arg2, %13 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) depend(taskdependin -> %4 : !llvm.ptr) {
+    ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: !llvm.ptr):
+      %14 = llvm.mlir.constant(0 : index) : i64
+      %15 = llvm.mlir.constant(10 : i32) : i32
+      %16 = llvm.mlir.constant(1 : index) : i64
+      %17 = llvm.mlir.constant(40 : index) : i64
+      %18 = llvm.load %arg3 : !llvm.ptr -> i32
+      %19 = llvm.sext %18 : i32 to i64
+      %20 = llvm.trunc %16 : i64 to i32
+      llvm.br ^bb1(%20, %19 : i32, i64)
+    ^bb1(%21: i32, %22: i64):  // 2 preds: ^bb0, ^bb2
+      %23 = llvm.icmp "sgt" %22, %14 : i64
+      llvm.cond_br %23, ^bb2, ^bb3
+    ^bb2:  // pred: ^bb1
+      llvm.store %21, %arg2 : i32, !llvm.ptr
+      %24 = llvm.load %arg2 : !llvm.ptr -> i32
+      %25 = llvm.sext %24 : i32 to i64
+      %26 = llvm.mlir.constant(1 : i64) : i64
+      %27 = llvm.mlir.constant(0 : i64) : i64
+      %28 = llvm.sub %25, %26 overflow<nsw> : i64
+      %29 = llvm.mul %28, %26 overflow<nsw> : i64
+      %30 = llvm.mul %29, %26 overflow<nsw> : i64
+      %31 = llvm.add %30, %27 overflow<nsw> : i64
+      %32 = llvm.mul %26, %17 overflow<nsw> : i64
+      %33 = llvm.getelementptr %arg0[%31] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+      %34 = llvm.load %33 : !llvm.ptr -> i32
+      %35 = llvm.add %34, %15 : i32
+      %36 = llvm.mlir.constant(1 : i64) : i64
+      %37 = llvm.mlir.constant(0 : i64) : i64
+      %38 = llvm.sub %25, %36 overflow<nsw> : i64
+      %39 = llvm.mul %38, %36 overflow<nsw> : i64
+      %40 = llvm.mul %39, %36 overflow<nsw> : i64
+      %41 = llvm.add %40, %37 overflow<nsw> : i64
+      %42 = llvm.mul %36, %17 overflow<nsw> : i64
+      %43 = llvm.getelementptr %arg1[%41] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+      llvm.store %35, %43 : i32, !llvm.ptr
+      %44 = llvm.load %arg2 : !llvm.ptr -> i32
+      %45 = llvm.add %44, %20 : i32
+      %46 = llvm.sub %22, %16 : i64
+      llvm.br ^bb1(%45, %46 : i32, i64)
+    ^bb3:  // pred: ^bb1
+      llvm.store %21, %arg2 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
+  }
+  llvm.mlir.global internal @_QFEa() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
+    %0 = llvm.mlir.zero : !llvm.array<40 x i32>
+    llvm.return %0 : !llvm.array<40 x i32>
+  }
+  llvm.mlir.global internal @_QFEb() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
+    %0 = llvm.mlir.zero : !llvm.array<40 x i32>
+    llvm.return %0 : !llvm.array<40 x i32>
+  }
+  llvm.mlir.global internal @_QFEc() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
+    %0 = llvm.mlir.zero : !llvm.array<40 x i32>
+    llvm.return %0 : !llvm.array<40 x i32>
+  }
+  llvm.mlir.global internal @_QFEn() {addr_space = 0 : i32} : i32 {
+    %0 = llvm.mlir.constant(40 : i32) : i32
+    llvm.return %0 : i32
+  }
+  llvm.func @_FortranAProgramStart(i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
+  llvm.func @_FortranAProgramEndStatement() attributes {sym_visibility = "private"}
+  llvm.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 {
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    %1 = llvm.mlir.zero : !llvm.ptr
+    llvm.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %1) {fastmathFlags = #llvm.fastmath<contract>} : (i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> ()
+    llvm.call @_QQmain() {fastmathFlags = #llvm.fastmath<contract>} : () -> ()
+    llvm.call @_FortranAProgramEndStatement() {fastmathFlags = #llvm.fastmath<contract>} : () -> ()
+    llvm.return %0 : i32
+  }
+
+// %structArg holds pointers to shared data.
+// CHECK: define void @_QQmain() {
+// CHECK-DAG: %[[STRUCTARG:.+]] = alloca { ptr, ptr, ptr }, align 8
+// CHECK-DAG:  %[[DEP_ARRAY:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK: %[[DEP_INFO:.+]]  = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARRAY]], i64 0, i64 0
+// CHECK: %[[PTR0:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 0
+// CHECK: store i64 ptrtoint (ptr @_QFEa to i64), ptr %[[PTR0]], align 4
+// CHECK: %[[PTR1:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 1
+// CHECK: store i64 8, ptr %[[PTR1]], align 4
+// CHECK: %[[PTR2:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 2
+// CHECK: store i8 1, ptr %[[PTR2]], align 1
+
+// CHECK: %[[TASKDATA:.+]] = call ptr @__kmpc_omp_task_alloc({{.+}}, ptr @.omp_target_task_proxy_func)
+// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[TASKDATA]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHARED_DATA]], ptr align 1 %[[STRUCTARG]], i64 24, i1 false)
+// CHECK: call void @__kmpc_omp_wait_deps({{.+}}, i32 1, ptr %[[DEP_ARRAY]], i32 0, ptr null)
+// CHECK: call void @__kmpc_omp_task_begin_if0({{.+}}, ptr  %[[TASKDATA]])
+// CHECK: call void @.omp_target_task_proxy_func({{.+}}, ptr %[[TASKDATA]])
+// CHECK: call void @__kmpc_omp_task_complete_if0({{.+}}, ptr %[[TASKDATA]])
+	      

>From 25021ebeca17e7268c36c85d0dd0172ff41925c5 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 13:08:29 -0500
Subject: [PATCH 10/24] add an end-to-end offloading test for target depend

---
 .../test/offloading/fortran/target-depend.f90 | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 offload/test/offloading/fortran/target-depend.f90

diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
new file mode 100644
index 0000000000000..6a05cf4c025e8
--- /dev/null
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -0,0 +1,40 @@
+! Offloading test checking the use of the depend clause on
+! the target construct
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+  integer :: a = 0
+  call foo(5, a)
+  print*, "======= FORTRAN Test passed! ======="
+  print*, "foo(5) returned ", a, ", expected 6\n"
+  !       stop 0
+end program main
+subroutine foo(N, r)
+  integer, intent(in) :: N
+  integer, intent(out) :: r
+  integer :: z
+
+  z = 1
+  !$omp task depend(out: z) shared(z)
+  ! print*, "N is ", N
+  ! print*, "z is ", z
+  z = N
+!  print*, "z is ", z
+  !$omp end task
+
+  !$omp target map(tofrom: z) depend(in: z)
+  z = z + 1
+  !$omp end target
+
+  r = z
+end subroutine foo
+
+!CHECK: ======= FORTRAN Test passed! =======
+!CHECK: foo(5) returned 6 , expected 6

>From e484ee2de917e3bed923a4f4a68d815900fd1c4f Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 13:38:12 -0500
Subject: [PATCH 11/24] Clean up, clean up, everybody clean up (some more)

---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 16 +++--------
 clang/lib/CodeGen/CGStmt.cpp                  | 27 ------------------
 clang/lib/CodeGen/CGStmtOpenMP.cpp            |  7 -----
 clang/lib/CodeGen/CodeGenFunction.h           |  3 --
 clang/lib/Parse/ParseOpenMP.cpp               | 11 --------
 clang/lib/Sema/SemaOpenMP.cpp                 | 28 -------------------
 llvm/lib/IR/Function.cpp                      |  1 -
 llvm/lib/IR/Instruction.cpp                   |  5 ----
 mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp    |  3 +-
 .../test/offloading/fortran/target-depend.f90 |  3 --
 10 files changed, 5 insertions(+), 99 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 3dfc6bccc8c05..f6d12d46cfc07 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -48,10 +48,6 @@
 using namespace clang;
 using namespace CodeGen;
 using namespace llvm::omp;
-// Experiment to make sanitizers easier to debug
-static llvm::cl::opt<bool> NewClangTargetTaskCodeGen(
-    "new-clang-target-task-codegen", llvm::cl::Optional,
-    llvm::cl::desc("new clang target task codegen."), llvm::cl::init(false));
 
 namespace {
 /// Base class for handling code generation inside OpenMP regions.
@@ -3707,7 +3703,7 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
       KmpTaskTWithPrivatesQTy, KmpTaskTQTy, SharedsPtrTy, TaskFunction,
       TaskPrivatesMap);
 
-  // build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
+  // Build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
   // kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
   // kmp_routine_entry_t *task_entry);
   // Task flags. Format is taken from
@@ -9624,13 +9620,9 @@ static void emitTargetCallKernelLaunch(
         DeviceID, RTLoc, AllocaIP));
   };
 
-  if (RequiresOuterTask) {
-    if (NewClangTargetTaskCodeGen) {
-      llvm::errs() << "Using OMPIRBuilder for target task codegen\n";
-    } else {
-      CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
-    }
-  } else
+  if (RequiresOuterTask)
+    CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
+  else
     OMPRuntime->emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen);
 }
 
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 26baad23b87c5..99daaa14cf3fe 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -3135,12 +3135,6 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
   const RecordDecl *RD = S.getCapturedRecordDecl();
   SourceLocation Loc = S.getBeginLoc();
   assert(CD->hasBody() && "missing CapturedDecl body");
-  llvm::errs() << "LLVMDEBUG:: In GenerateCapturedStmtFunction\n";
-  if (Builder.GetInsertBlock()) {
-    llvm::errs()
-        << "LLVMDEBUG:: In GenerateCapturedStmtFunction, InsertBlock is \n";
-    Builder.GetInsertBlock()->dump();
-  }
 
   // Build the argument list.
   ASTContext &Ctx = CGM.getContext();
@@ -3162,13 +3156,6 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
   // Generate the function.
   StartFunction(CD, Ctx.VoidTy, F, FuncInfo, Args, CD->getLocation(),
                 CD->getBody()->getBeginLoc());
-  llvm::errs()
-      << "LLVMDEBUG:: In GenerateCapturedStmtFunction: After StartFunction\n";
-  if (Builder.GetInsertBlock()) {
-    llvm::errs()
-        << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
-    Builder.GetInsertBlock()->getParent()->dump();
-  }
   // Set the context parameter in CapturedStmtInfo.
   Address DeclPtr = GetAddrOfLocalVar(CD->getContextParam());
   CapturedStmtInfo->setContextValue(Builder.CreateLoad(DeclPtr));
@@ -3194,21 +3181,7 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
   }
 
   PGO.assignRegionCounters(GlobalDecl(CD), F);
-  llvm::errs()
-      << "LLVMDEBUG:: In GenerateCapturedStmtFunction: Before EmitBody\n";
-  if (Builder.GetInsertBlock()) {
-    llvm::errs()
-        << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
-    Builder.GetInsertBlock()->getParent()->dump();
-  }
   CapturedStmtInfo->EmitBody(*this, CD->getBody());
-  llvm::errs()
-      << "LLVMDEBUG:: In GenerateCapturedStmtFunction: After EmitBody\n";
-  if (Builder.GetInsertBlock()) {
-    llvm::errs()
-        << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
-    Builder.GetInsertBlock()->getParent()->dump();
-  }
   FinishFunction(CD->getBodyRBrace());
 
   return F;
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 1cd3e72c38cc0..040b52a1101dd 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5043,11 +5043,6 @@ createImplicitFirstprivateForType(ASTContext &C, OMPTaskDataTy &Data,
   Data.FirstprivateInits.emplace_back(InitRef);
   return OrigVD;
 }
-void CodeGenFunction::NewEmitOMPTargetTaskBasedDirective(
-    const OMPExecutableDirective &S, const RegionCodeGenTy &BodyGen,
-    OMPTargetDataInfo &InputInfo) {
-  EmitOMPTargetTaskBasedDirective(S, BodyGen, InputInfo);
-}
 void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
     const OMPExecutableDirective &S, const RegionCodeGenTy &BodyGen,
     OMPTargetDataInfo &InputInfo) {
@@ -5187,8 +5182,6 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
   llvm::Function *OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction(
       S, *I, *PartId, *TaskT, S.getDirectiveKind(), CodeGen, /*Tied=*/true,
       Data.NumberOfParts);
-  llvm::errs() << "LLVMDEBUG::Outlined Task Fn is \n";
-  OutlinedFn->dump();
   llvm::APInt TrueOrFalse(32, S.hasClausesOfKind<OMPNowaitClause>() ? 1 : 0);
   IntegerLiteral IfCond(getContext(), TrueOrFalse,
                         getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index b80be8ed85458..45585361a4fc9 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3797,9 +3797,6 @@ class CodeGenFunction : public CodeGenTypeCache {
   void EmitOMPTargetTaskBasedDirective(const OMPExecutableDirective &S,
                                        const RegionCodeGenTy &BodyGen,
                                        OMPTargetDataInfo &InputInfo);
-  void NewEmitOMPTargetTaskBasedDirective(const OMPExecutableDirective &S,
-                                          const RegionCodeGenTy &BodyGen,
-                                          OMPTargetDataInfo &InputInfo);
   void processInReduction(const OMPExecutableDirective &S,
                           OMPTaskDataTy &Data,
                           CodeGenFunction &CGF,
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 547dd8fcf4552..33debdd3b1476 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2972,19 +2972,10 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
       // FIXME: We create a bogus CompoundStmt scope to hold the contents of
       // the captured region. Code elsewhere assumes that any FunctionScopeInfo
       // should have at least one compound statement scope within it.
-      if (AssociatedStmt.get()) {
-        llvm::errs() << __FUNCTION__ << "Loc-1:\n";
-        AssociatedStmt.get()->dump();
-      }
       ParsingOpenMPDirectiveRAII NormalScope(*this, /*Value=*/false);
       {
         Sema::CompoundScopeRAII Scope(Actions);
         AssociatedStmt = ParseStatement();
-        // Stmt *pdb_print = AssociatedStmt.get();
-        // if (pdb_print) {
-        //   llvm::errs() << __FUNCTION__ << "Loc0:\n";
-        //   pdb_print->dump();
-        // }
         if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
             getLangOpts().OpenMPIRBuilder)
           AssociatedStmt =
@@ -2992,8 +2983,6 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
       }
       AssociatedStmt =
           Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses);
-      // llvm::errs() << __FUNCTION__ << "Loc1:\n";
-      // AssociatedStmt.get()->dump();
     } else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data ||
                DKind == OMPD_target_exit_data) {
       Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope());
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 211b93a171dfe..b37a17d0e72a5 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4854,19 +4854,6 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
 
   SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
   getOpenMPCaptureRegions(CaptureRegions, DSAStack->getCurrentDirective());
-  // llvm::errs() << __FUNCTION__ << ": Loc0:\n";
-  // for (OpenMPDirectiveKind c : CaptureRegions) {
-  //   switch (c) {
-  //   case OMPD_task:
-  //     llvm::errs() << "OMPD_task\n";
-  //     break;
-  //   case OMPD_target:
-  //     llvm::errs() << "OMPD_target\n";
-  //     break;
-  //   default:
-  //     llvm::errs() << "default\n";
-  //   }
-  // }
   OMPOrderedClause *OC = nullptr;
   OMPScheduleClause *SC = nullptr;
   SmallVector<const OMPLinearClause *, 4> LCs;
@@ -5018,11 +5005,7 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
     }
     if (++CompletedRegions == CaptureRegions.size())
       DSAStack->setBodyComplete();
-    // llvm::errs() << __FUNCTION__ << ": Loc1:\n";
-    // SR.get()->dump();
     SR = SemaRef.ActOnCapturedRegionEnd(SR.get());
-    // llvm::errs() << __FUNCTION__ << ": Loc2:\n";
-    // SR.get()->dump();
   }
   return SR;
 }
@@ -6354,17 +6337,6 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
   OpenMPBindClauseKind BindKind = OMPC_BIND_unknown;
   llvm::SmallVector<OMPClause *> ClausesWithoutBind;
   bool UseClausesWithoutBind = false;
-  // if (Kind == Directive::OMPD_target) {
-  //   if (AStmt) {
-  //     llvm::errs() << __FUNCTION__ << "***********************\n";
-  //     AStmt->dump();
-  //     llvm::errs() << __FUNCTION__ << "***PRETTY***\n";
-  //     AStmt->dumpPretty(getASTContext());
-  //   } else {
-  //     llvm::errs() << "__FUNCTION__"
-  //                  << ": AStmt is nullptr\n";
-  //   }
-  // }
   if (const OMPBindClause *BC =
           OMPExecutableDirective::getSingleClause<OMPBindClause>(Clauses))
     BindKind = BC->getBindKind();
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 15259b46afe38..74a6fa80f1f7f 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -64,7 +64,6 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ModRef.h"
 #include <cassert>
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 22f5b5a41fc3f..29272e627a1d1 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -128,11 +128,6 @@ void Instruction::insertAfter(Instruction *InsertPos) {
 BasicBlock::iterator Instruction::insertInto(BasicBlock *ParentBB,
                                              BasicBlock::iterator It) {
   assert(getParent() == nullptr && "Expected detached instruction");
-  if (!(It == ParentBB->end() || It->getParent() == ParentBB)) {
-    llvm::errs() << "ParentBB = " << *ParentBB << "\n";
-    llvm::errs() << "It = " << *It << "\n";
-    llvm::errs() << "It->getParent() = " << *It->getParent() << "\n";
-  }
   assert((It == ParentBB->end() || It->getParent() == ParentBB) &&
          "It not in ParentBB");
   insertBefore(*ParentBB, It);
diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
index c0c03df7cbc5d..4558893779534 100644
--- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
@@ -18,7 +18,6 @@
 #include "mlir/Tools/mlir-translate/Translation.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/IR/Verifier.h"
 
 using namespace mlir;
 
@@ -31,7 +30,7 @@ void registerToLLVMIRTranslation() {
         auto llvmModule = translateModuleToLLVMIR(op, llvmContext);
         if (!llvmModule)
           return failure();
-        llvm::verifyModule(*llvmModule);
+
         llvmModule->print(output, nullptr);
         return success();
       },
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
index 6a05cf4c025e8..db58f2db6bbe9 100644
--- a/offload/test/offloading/fortran/target-depend.f90
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -23,10 +23,7 @@ subroutine foo(N, r)
 
   z = 1
   !$omp task depend(out: z) shared(z)
-  ! print*, "N is ", N
-  ! print*, "z is ", z
   z = N
-!  print*, "z is ", z
   !$omp end task
 
   !$omp target map(tofrom: z) depend(in: z)

>From febda4fbf1690d0dbe2bb639d106ef28b481bdff Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 13:41:27 -0500
Subject: [PATCH 12/24] Add back some ws that was removed

---
 clang/lib/CodeGen/CGStmtOpenMP.cpp | 1 +
 clang/lib/Parse/ParseOpenMP.cpp    | 1 +
 clang/lib/Sema/SemaOpenMP.cpp      | 1 +
 llvm/lib/IR/Function.cpp           | 2 ++
 4 files changed, 5 insertions(+)

diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 040b52a1101dd..6410f9e102c90 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5043,6 +5043,7 @@ createImplicitFirstprivateForType(ASTContext &C, OMPTaskDataTy &Data,
   Data.FirstprivateInits.emplace_back(InitRef);
   return OrigVD;
 }
+
 void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
     const OMPExecutableDirective &S, const RegionCodeGenTy &BodyGen,
     OMPTargetDataInfo &InputInfo) {
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 33debdd3b1476..e959dd6378f46 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2976,6 +2976,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
       {
         Sema::CompoundScopeRAII Scope(Actions);
         AssociatedStmt = ParseStatement();
+
         if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
             getLangOpts().OpenMPIRBuilder)
           AssociatedStmt =
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index b37a17d0e72a5..bab61e8fd54e8 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -6337,6 +6337,7 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
   OpenMPBindClauseKind BindKind = OMPC_BIND_unknown;
   llvm::SmallVector<OMPClause *> ClausesWithoutBind;
   bool UseClausesWithoutBind = false;
+
   if (const OMPBindClause *BC =
           OMPExecutableDirective::getSingleClause<OMPBindClause>(Clauses))
     BindKind = BC->getBindKind();
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 74a6fa80f1f7f..13fa1afeaaff2 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -74,6 +74,7 @@
 
 using namespace llvm;
 using ProfileCount = Function::ProfileCount;
+
 // Explicit instantiations of SymbolTableListTraits since some of the methods
 // are not in the public header file...
 template class llvm::SymbolTableListTraits<BasicBlock>;
@@ -572,6 +573,7 @@ void Function::deleteBodyImpl(bool ShouldDrop) {
     }
     setValueSubclassData(getSubclassDataFromValue() & ~0xe);
   }
+
   // Metadata is stored in a side-table.
   clearMetadata();
 }

>From 9cc0234e9e9533883ecea4b71119e497e89d9e81 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 17:05:16 -0500
Subject: [PATCH 13/24] More cleanup and comments

---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  14 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 172 ++++++++++++------
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |   2 +-
 3 files changed, 120 insertions(+), 68 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 8a67cd4b8d9f0..2ed130c87d40b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2249,15 +2249,8 @@ class OpenMPIRBuilder {
   /// \param BodyGenCB Callback that will generate the region code.
   /// \param ArgAccessorFuncCB Callback that will generate accessors
   /// instructions for passed in target arguments where neccessary
-
-  InsertPointTy newCreateTarget(
-      const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy AllocaIP,
-      OpenMPIRBuilder::InsertPointTy CodeGenIP,
-      TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, int32_t NumThreads,
-      SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
-      TargetBodyGenCallbackTy BodyGenCB,
-      TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
-      SmallVector<DependData> Dependencies = {});
+  /// \param Dependencies A vector of DependData objects that carry
+  /// dependency information as passed in the depend clause
   InsertPointTy createTarget(const LocationDescription &Loc,
                              OpenMPIRBuilder::InsertPointTy AllocaIP,
                              OpenMPIRBuilder::InsertPointTy CodeGenIP,
@@ -2266,7 +2259,8 @@ class OpenMPIRBuilder {
                              SmallVectorImpl<Value *> &Inputs,
                              GenMapInfoCallbackTy GenMapInfoCB,
                              TargetBodyGenCallbackTy BodyGenCB,
-                             TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB);
+                             TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
+                             SmallVector<DependData> Dependencies = {});
 
   /// Returns __kmpc_for_static_init_* runtime function for the specified
   /// size \a IVSize and sign \a IVSigned. Will create a distribute call
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 565e13c2ad8c9..da2bf360cc8fd 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -73,11 +73,6 @@ static cl::opt<double> UnrollThresholdFactor(
              "simplifications still taking place"),
     cl::init(1.5));
 
-static cl::opt<bool>
-    NewOMPIRBuilderTargetCodegen("new-ompirbuilder-target-codegen", cl::Hidden,
-                                 cl::desc("Use target-task based codegen."),
-                                 cl::init(false));
-
 #ifndef NDEBUG
 /// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
 /// at position IP1 may change the meaning of IP2 or vice-versa. This is because
@@ -833,6 +828,9 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
 
   if (!OffloadInfoManager.empty())
     createOffloadEntriesAndInfoMetadata(ErrorReportFn);
+
+  LLVM_DEBUG(dbgs() << "Module after OMPIRBuilder::finalize\n");
+  LLVM_DEBUG(dbgs() << M << "\n");
 }
 
 OpenMPIRBuilder::~OpenMPIRBuilder() {
@@ -5301,15 +5299,18 @@ static Function *createOutlinedFunction(
   return Func;
 }
 
-// define internal i32 @.omp_task_entry..3(i32 noundef %0, ptr noalias noundef
-// %1) #3 {
+// Create an entry point for a target task.
+// It'll have the following signature:
+// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
+// This function is called from emitTargetTask once the
+// code to launch the target kernel has been outlined already.
 static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
                                        IRBuilderBase &Builder,
                                        CallInst *StaleCI) {
-  // Create a function with the following signature
-  // define internal i32 @.omp_task_entry..3(i32 noundef %0, ptr noalias noundef
-  // %1) #3 {
   Module &M = OMPBuilder.M;
+  // CalledFunction is the target launch function, i.e.
+  // the function that sets up kernel arguments and calls
+  // __tgt_target_kernel to launch the kernel on the device.
   Function *CalledFunction = StaleCI->getCalledFunction();
   OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
                                     StaleCI->getIterator());
@@ -5323,14 +5324,16 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
   auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
                                   ".omp_target_task_proxy_func",
                                   Builder.GetInsertBlock()->getModule());
-  //  auto OldInsertPoint = Builder.saveIP();
 
   BasicBlock *EntryBB =
       BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
   Builder.SetInsertPoint(EntryBB);
 
   bool HasShareds = StaleCI->arg_size() > 1;
-  // PDB: Temporary assert.
+  // TODO: This is a temporary assert to prove to ourselves that
+  // the outlined target launch function is always going to have
+  // at most two arguments if there is any data shared between
+  // host and device.
   assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
          "StaleCI with shareds should have exactly two arguments.");
   if (HasShareds) {
@@ -5363,12 +5366,9 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
 
     Builder.CreateCall(CalledFunction, {ThreadId, NewArgStructAlloca});
   }
-  // CalledFunction->removeFnAttr(llvm::Attribute::NoInline);
-  // CalledFunction->addFnAttr(llvm::Attribute::AlwaysInline);
   ProxyFn->getArg(0)->setName("thread.id");
   ProxyFn->getArg(1)->setName("task");
   Builder.CreateRetVoid();
-  //  Builder.restoreIP(OldInsertPoint);
   return ProxyFn;
 }
 static void emitTargetOutlinedFunction(
@@ -5395,6 +5395,87 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
     SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
     bool HasNoWait) {
 
+  // When we arrive at this function, the target region itself has been
+  // outlined into the function OutlinedFn.
+  // So at this point, for
+  // --------------------------------------------------
+  //   void user_code_that_offloads(...) {
+  //     omp target depend(..) map(from:a) map(to:b, c)
+  //        a = b + c
+  //   }
+  //
+  // --------------------------------------------------
+  //
+  // we have
+  //
+  // --------------------------------------------------
+  //
+  //   void user_code_that_offloads(...) {
+  //     %.offload_baseptrs = alloca [3 x ptr], align 8
+  //     %.offload_ptrs = alloca [3 x ptr], align 8
+  //     %.offload_mappers = alloca [3 x ptr], align 8
+  //     ;; target region has been outlined and now we need to
+  //     ;; offload to it via a target task.
+  //   }
+  //   void outlined_device_function(ptr a, ptr b, ptr c) {
+  //     *a = *b + *c
+  //   }
+  //
+  // We have to now do the following
+  // (i)   Make an offloading call to outlined_device_function using the OpenMP RTL
+  //       See 'kernel_launch_function' in the pseudo code below. This is emitted by
+  //       emitKernelLaunch
+  // (ii)  Create a task entry point function that calls kernel_launch_function and
+  //       is the entry point for the target task. See '@.omp_target_task_proxy_func
+  //       in the pseudocode below.
+  // (iii) Create a task with the task entry point created in (ii)
+  //
+  // That is we create the following
+  //
+  //   void user_code_that_offloads(...) {
+  //     %.offload_baseptrs = alloca [3 x ptr], align 8
+  //     %.offload_ptrs = alloca [3 x ptr], align 8
+  //     %.offload_mappers = alloca [3 x ptr], align 8
+  //
+  //     %structArg = alloca { ptr, ptr, ptr }, align 8
+  //     %strucArg[0] = %.offload_baseptrs
+  //     %strucArg[1] = %.offload_ptrs
+  //     %strucArg[2] = %.offload_mappers
+  //     proxy_target_task = @__kmpc_omp_task_alloc(...,  @.omp_target_task_proxy_func)
+  //     memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
+  //     dependencies_array = alloca [
+  //     ;; if nowait not present
+  //     call @__kmpc_omp_wait_deps(..., dependencies_array)
+  //     call @__kmpc_omp_task_begin_if0(...)
+  //     call @ @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
+  //     call @__kmpc_omp_task_complete_if0(...)
+  //   }
+  //
+  //   define internal void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task) {
+  //       %structArg = alloca {ptr, ptr, ptr}
+  //       %shared_data = load (getelementptr %task, 0, 0)
+  //       mempcy(%structArg, %shared_data, sizeof(structArg))
+  //       kernel_launch_function(%thread.id, %structArg)
+  //   }
+  //
+  //   We need the proxy function because the signature of the task entry point expected
+  //   by kmpc_omp_task is always the same and will be different from that of the
+  //   kernel_launch function.
+  //
+  //   kernel_launch_function is generated by emitKernelLaunch and has the always_inline
+  //   attribute.
+  //   void kernel_launch_function(thread_id, structArg) alwaysinline {
+  //       %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
+  //       offload_baseptrs = load(getelementptr structArg, 0, 0)
+  //       offload_ptrs = load(getelementptr structArg, 0, 1)
+  //       offload_mappers = load(getelementptr structArg, 0, 2)
+  //       ; setup kernel_args using offload_baseptrs, offload_ptrs and offload_mappers
+  //       call i32 @__tgt_target_kernel(..., outlined_device_function, ptr %kernel_args)
+  //   }
+  //   void outlined_device_function(ptr a, ptr b, ptr c) {
+  //      *a = *b + *c
+  //   }
+  //
   BasicBlock *TargetTaskBodyBB =
       splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
   BasicBlock *TargetTaskAllocaBB =
@@ -5417,12 +5498,14 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
   Builder.restoreIP(TargetTaskBodyIP);
 
   // emitKernelLaunch makes the necessary runtime call to offload the kernel.
-  // We then outline all that code into a separate function that is called
-  // by the task wrapper function (aka Proxy task function - see
-  // emitProxyTaskFunction)
+  // We then outline all that code into a separate function ('kernel_launch_function' in
+  // the pseudo code above). This function is then called by the target task proxy
+  // function (see '@.omp_target_task_proxy_func' in the pseudo code above)
+  // "@.omp_target_task_proxy_func' is generated by emitProxyTaskFunction
   Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
                                      EmitTargetCallFallbackCB, Args, DeviceID,
                                      RTLoc, TargetTaskAllocaIP));
+
   OI.ExitBB = Builder.saveIP().getBlock();
   OI.PostOutlineCB = [this, ToBeDeleted, Dependencies,
                       HasNoWait](Function &OutlinedFn) mutable {
@@ -5439,6 +5522,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
                       << "\n");
 
     Function *ProxyFn = emitProxyTaskFunction(*this, Builder, StaleCI);
+
     LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
                       << "\n");
 
@@ -5546,13 +5630,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
     }
 
     StaleCI->eraseFromParent();
-    // Builder.SetInsertPoint(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
-    // if (HasShareds) {
-    //   LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
-    //   OutlinedFn.getArg(1)->replaceUsesWithIf(
-    //       Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
-    // }
-
     while (!ToBeDeleted.empty()) {
       ToBeDeleted.top()->eraseFromParent();
       ToBeDeleted.pop();
@@ -5583,8 +5660,6 @@ static void emitTargetCall(
   OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
                                   /*IsNonContiguous=*/true);
 
-  LLVM_DEBUG(dbgs() << "OMPBuilder.Builder = " << &OMPBuilder.Builder
-                    << ", Builder = " << &Builder << "\n");
   OpenMPIRBuilder::TargetDataRTArgs RTArgs;
   OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
                                           !MapInfo.Names.empty());
@@ -5632,7 +5707,10 @@ static void emitTargetCall(
   //   make task call
   // }
   //
-  if (NewOMPIRBuilderTargetCodegen && RequiresOuterTargetTask) {
+
+  // The presence of certain clauses on the target directive require the explicit
+  // generation of the target task.
+  if (RequiresOuterTargetTask) {
     OMPBuilder.emitTargetTask(OutlinedFn, OutlinedFnID,
                               EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,
                               AllocaIP, Dependencies, HasNoWait);
@@ -5642,7 +5720,7 @@ static void emitTargetCall(
         DeviceID, RTLoc, AllocaIP));
   }
 }
-OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::newCreateTarget(
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
     InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
     int32_t NumThreads, SmallVectorImpl<Value *> &Args,
@@ -5650,33 +5728,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::newCreateTarget(
     OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
     SmallVector<DependData> Dependencies) {
-  if (!NewOMPIRBuilderTargetCodegen) {
-    LLVM_DEBUG(dbgs() << "Old OpenMPIRBuilder target codegen\n");
-    return createTarget(Loc, AllocaIP, CodeGenIP, EntryInfo, NumTeams,
-                        NumThreads, Args, GenMapInfoCB, CBFunc,
-                        ArgAccessorFuncCB);
-  }
-  LLVM_DEBUG(dbgs() << "New OpenMPIRBuilder target codegen\n");
-  if (!updateToLocation(Loc))
-    return InsertPointTy();
 
-  Builder.restoreIP(CodeGenIP);
-  Function *OutlinedFn;
-  Constant *OutlinedFnID;
-  emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
-                             OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
-  if (!Config.isTargetDevice())
-    emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
-                   NumThreads, Args, GenMapInfoCB, Dependencies);
-  return Builder.saveIP();
-}
-OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
-    const LocationDescription &Loc, InsertPointTy AllocaIP,
-    InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
-    int32_t NumThreads, SmallVectorImpl<Value *> &Args,
-    GenMapInfoCallbackTy GenMapInfoCB,
-    OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
-    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
   if (!updateToLocation(Loc))
     return InsertPointTy();
 
@@ -5684,12 +5736,18 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
 
   Function *OutlinedFn;
   Constant *OutlinedFnID;
+  // The target region is outlined into its own function. The LLVM IR for
+  // the target region itself is generated using the callbacks CBFunc
+  // and ArgAccessorFuncCB
   emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
                              OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
+
+  // If we are not on the target device, then we need to generate code
+  // to make a remote call (offload) to the previously outlined function
+  // that represents the target region. Do that now.
   if (!Config.isTargetDevice())
     emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
-                   NumThreads, Args, GenMapInfoCB);
-
+                   NumThreads, Args, GenMapInfoCB, Dependencies);
   return Builder.saveIP();
 }
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 022ea3af7f58a..74f34e227d9f0 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -3097,7 +3097,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
     buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
                     moduleTranslation, dds);
 
-  builder.restoreIP(moduleTranslation.getOpenMPBuilder()->newCreateTarget(
+  builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
       ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
       defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB,
       dds));

>From 141dccc06f933a737b7991add8dc466e831e45e9 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 17:11:49 -0500
Subject: [PATCH 14/24] clang-format fixes

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 55 +++++++++++++----------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index da2bf360cc8fd..46e68e8e0e6ca 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5422,12 +5422,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
   //   }
   //
   // We have to now do the following
-  // (i)   Make an offloading call to outlined_device_function using the OpenMP RTL
-  //       See 'kernel_launch_function' in the pseudo code below. This is emitted by
-  //       emitKernelLaunch
-  // (ii)  Create a task entry point function that calls kernel_launch_function and
-  //       is the entry point for the target task. See '@.omp_target_task_proxy_func
-  //       in the pseudocode below.
+  // (i)   Make an offloading call to outlined_device_function using the OpenMP
+  //       RTL. See 'kernel_launch_function' in the pseudo code below. This is
+  //       emitted by emitKernelLaunch
+  // (ii)  Create a task entry point function that calls kernel_launch_function
+  //       and is the entry point for the target task. See
+  //       '@.omp_target_task_proxy_func' in the pseudocode below.
   // (iii) Create a task with the task entry point created in (ii)
   //
   // That is we create the following
@@ -5441,36 +5441,42 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
   //     %strucArg[0] = %.offload_baseptrs
   //     %strucArg[1] = %.offload_ptrs
   //     %strucArg[2] = %.offload_mappers
-  //     proxy_target_task = @__kmpc_omp_task_alloc(...,  @.omp_target_task_proxy_func)
+  //     proxy_target_task = @__kmpc_omp_task_alloc(...,
+  //                                               @.omp_target_task_proxy_func)
   //     memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
-  //     dependencies_array = alloca [
+  //     dependencies_array = ...
   //     ;; if nowait not present
   //     call @__kmpc_omp_wait_deps(..., dependencies_array)
   //     call @__kmpc_omp_task_begin_if0(...)
-  //     call @ @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
-  //     call @__kmpc_omp_task_complete_if0(...)
+  //     call @.omp_target_task_proxy_func(thread_id, ptr %proxy_target_task)
+  //     call @__kmpc_omp_task_complete_if0(...)
   //   }
   //
-  //   define internal void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task) {
+  //   define internal void @.omp_target_task_proxy_func(i32 %thread.id,
+  //                                                     ptr %task) {
   //       %structArg = alloca {ptr, ptr, ptr}
   //       %shared_data = load (getelementptr %task, 0, 0)
   //       mempcy(%structArg, %shared_data, sizeof(structArg))
   //       kernel_launch_function(%thread.id, %structArg)
   //   }
   //
-  //   We need the proxy function because the signature of the task entry point expected
-  //   by kmpc_omp_task is always the same and will be different from that of the
-  //   kernel_launch function.
+  //   We need the proxy function because the signature of the task entry point
+  //   expected by kmpc_omp_task is always the same and will be different from
+  //   that of the kernel_launch function.
   //
-  //   kernel_launch_function is generated by emitKernelLaunch and has the always_inline
-  //   attribute.
-  //   void kernel_launch_function(thread_id, structArg) alwaysinline {
+  //   kernel_launch_function is generated by emitKernelLaunch and has the
+  //   always_inline attribute.
+  //
+  //   void kernel_launch_function(thread_id, structArg) alwaysinline {
   //       %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
   //       offload_baseptrs = load(getelementptr structArg, 0, 0)
   //       offload_ptrs = load(getelementptr structArg, 0, 1)
   //       offload_mappers = load(getelementptr structArg, 0, 2)
-  //       ; setup kernel_args using offload_baseptrs, offload_ptrs and offload_mappers
-  //       call i32 @__tgt_target_kernel(..., outlined_device_function, ptr %kernel_args)
+  //       ; setup kernel_args using offload_baseptrs, offload_ptrs and
+  //       ; offload_mappers
+  //       call i32 @__tgt_target_kernel(...,
+  //                                     outlined_device_function,
+  //                                     ptr %kernel_args)
   //   }
   //   void outlined_device_function(ptr a, ptr b, ptr c) {
   //      *a = *b + *c
@@ -5498,9 +5504,10 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
   Builder.restoreIP(TargetTaskBodyIP);
 
   // emitKernelLaunch makes the necessary runtime call to offload the kernel.
-  // We then outline all that code into a separate function ('kernel_launch_function' in
-  // the pseudo code above). This function is then called by the target task proxy
-  // function (see '@.omp_target_task_proxy_func' in the pseudo code above)
+  // We then outline all that code into a separate function
+  // ('kernel_launch_function' in the pseudo code above). This function is then
+  // called by the target task proxy function (see
+  // '@.omp_target_task_proxy_func' in the pseudo code above)
   // "@.omp_target_task_proxy_func' is generated by emitProxyTaskFunction
   Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
                                      EmitTargetCallFallbackCB, Args, DeviceID,
@@ -5708,8 +5715,8 @@ static void emitTargetCall(
   // }
   //
 
-  // The presence of certain clauses on the target directive require the explicit
-  // generation of the target task.
+  // The presence of certain clauses on the target directive requires the
+  // explicit generation of the target task.
   if (RequiresOuterTargetTask) {
     OMPBuilder.emitTargetTask(OutlinedFn, OutlinedFnID,
                               EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,

>From 6697f1e266a9ea78062237829ed8ec2c7725a36c Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 17:13:29 -0500
Subject: [PATCH 15/24] remove commented out createFakeIntVal

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 36 -----------------------
 1 file changed, 36 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 46e68e8e0e6ca..cf6fcdca9294f 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -389,42 +389,6 @@ Value *createFakeIntVal(IRBuilderBase &Builder,
   ToBeDeleted.push(UseFakeVal);
   return FakeVal;
 }
-// // This function creates a fake integer value and a fake use for the integer
-// // value. It returns the fake value created. This is useful in modeling the
-// // extra arguments to the outlined functions.
-// Value *createFakeIntVal(IRBuilder<> &Builder,
-//                         OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
-//                         std::stack<Instruction *> &ToBeDeleted,
-//                         OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
-//                         const Twine &Name = "", bool AsPtr = true) {
-//   Builder.restoreIP(OuterAllocaIP);
-//   Instruction *FakeVal;
-//   AllocaInst *FakeValAddr =
-//       Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
-//   ToBeDeleted.push(FakeValAddr);
-
-//   if (AsPtr) {
-//     FakeVal = FakeValAddr;
-//   } else {
-//     FakeVal =
-//         Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
-//     ToBeDeleted.push(FakeVal);
-//   }
-
-//   // Generate a fake use of this value
-//   Builder.restoreIP(InnerAllocaIP);
-//   Instruction *UseFakeVal;
-//   if (AsPtr) {
-//     UseFakeVal =
-//         Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
-//   } else {
-//     UseFakeVal =
-//         cast<BinaryOperator>(Builder.CreateAdd(FakeVal,
-//         Builder.getInt32(10)));
-//   }
-//   ToBeDeleted.push(UseFakeVal);
-//   return FakeVal;
-// }
 
 //===----------------------------------------------------------------------===//
 // OpenMPIRBuilderConfig

>From e75c8536219bd21cd7e653186b8520d2ea87c359 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 17:16:32 -0500
Subject: [PATCH 16/24] more cleanup

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index cf6fcdca9294f..f2ab1ad18e694 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5666,18 +5666,6 @@ static void emitTargetCall(
   OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
                                           NumTeamsVal, NumThreadsVal,
                                           DynCGGroupMem, HasNoWait);
-  // PDB: here you'll have to break the logic down to do the following
-  // if (!requiresoutertask) {
-  //    Builder.restoreIP(OMPBuilder.emitKernelLaunch(
-  //       Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
-  //       DeviceID, RTLoc, AllocaIP));
-  // else {
-  //   codegen_callback = codegen callback to create task logic which should be
-  //   received from openmptollvmirtranslation + emitkernellaunch
-  //   create_task(codegen_callback)
-  //   make task call
-  // }
-  //
 
   // The presence of certain clauses on the target directive require the
   // explicit generation of the target task.

>From 799751fa9aa71b0e9e75be60ecbdea6e6aba1200 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 23:31:54 -0500
Subject: [PATCH 17/24] Fix dependencies when nowait is used on target
 construct

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 36 ++++++++++++++++-------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index f2ab1ad18e694..70dcb5adef08c 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1703,6 +1703,10 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
 static Value *
 emitDepArray(OpenMPIRBuilder &OMPBuilder,
              SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
+  // Early return if we have no dependencies to process
+  if (!Dependencies.size())
+    return nullptr;
+
   IRBuilderBase &Builder = OMPBuilder.Builder;
   Type *DependInfo = OMPBuilder.DependInfo;
   Module &M = OMPBuilder.M;
@@ -5561,16 +5565,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
       Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
                            SharedsSize);
     }
-    if (Dependencies.size()) {
-      Value *DepArray = emitDepArray(*this, Dependencies);
-      Function *TaskWaitFn =
-          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
-      Builder.CreateCall(
-          TaskWaitFn,
-          {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
-           ConstantInt::get(Builder.getInt32Ty(), 0),
-           ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
-    }
+
+    Value *DepArray = emitDepArray(*this, Dependencies);
 
     // ---------------------------------------------------------------
     // V5.2 13.8 target construct
@@ -5581,6 +5577,15 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
     // The above means that the lack of a nowait on the target construct
     // translates to '#pragma omp task if(0)'
     if (!HasNoWait) {
+      if (DepArray) {
+        Function *TaskWaitFn =
+            getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
+        Builder.CreateCall(
+            TaskWaitFn,
+            {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
+             ConstantInt::get(Builder.getInt32Ty(), 0),
+             ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
+      }
       // Included task.
       Function *TaskBeginFn =
           getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
@@ -5594,6 +5599,17 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
         CI = Builder.CreateCall(ProxyFn, {ThreadID});
       CI->setDebugLoc(StaleCI->getDebugLoc());
       Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
+    } else if (DepArray) {
+      // HasNoWait - meaning the task may be deferred. Call
+      // __kmpc_omp_task_with_deps if there are dependencies,
+      // else call __kmpc_omp_task
+      Function *TaskFn =
+          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
+      Builder.CreateCall(
+          TaskFn,
+          {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
+           DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
+           ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
     } else {
       // Emit the @__kmpc_omp_task runtime call to spawn the task
       Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);

>From 77e5753582539cba299ab1410c5b966bba92f63c Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 31 May 2024 10:18:39 -0500
Subject: [PATCH 18/24] Add comments for emitTargetTask and emitDepArray

---
 llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 16 ++++++++++++++++
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp        |  6 ++++++
 2 files changed, 22 insertions(+)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2ed130c87d40b..d028820f16c91 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1761,11 +1761,27 @@ class OpenMPIRBuilder {
       const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
       EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
       Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
+
+  /// Generate a target-task for the target construct
+  ///
+  /// \param OutlinedFn The outlined device/target kernel function.
+  /// \param OutlinedFnID The outlined function ID.
+  /// \param EmitTargetCallFallbackCB Call back function to generate host
+  ///        fallback code.
+  /// \param Args Data structure holding information about the kernel arguments.
+  /// \param DeviceID Identifier for the device via the 'device' clause.
+  /// \param RTLoc Source location identifier
+  /// \param AllocaIP The insertion point to be used for alloca instructions.
+  /// \param Dependencies Vector of DependData objects holding information of
+  ///        dependencies as specified by the 'depend' clause.
+  /// \param HasNoWait True if the target construct had 'nowait' on it, false
+  ///        otherwise
   InsertPointTy emitTargetTask(
       Function *OutlinedFn, Value *OutlinedFnID,
       EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
       Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP,
       SmallVector<OpenMPIRBuilder::DependData> &Dependencies, bool HasNoWait);
+
   /// Emit the arguments to be passed to the runtime library based on the
   /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
   /// ForEndCall, emit map types to be passed for the end of the region instead
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 70dcb5adef08c..5e084a5acb6f2 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1700,6 +1700,12 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
     return;
   emitTaskyieldImpl(Loc);
 }
+
+// Processes the dependencies in Dependencies and does the following
+// - Allocates space on the stack of an array of DependInfo objects
+// - Populates each DependInfo object with relevant information of
+//   the corresponding dependence.
+// - All code is inserted in the entry block of the current function.
 static Value *
 emitDepArray(OpenMPIRBuilder &OMPBuilder,
              SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {

>From 073d194e9800cea748b186b27950ff9862eaf5f3 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 31 May 2024 10:28:56 -0500
Subject: [PATCH 19/24] Fix comment in target-depend.f90

---
 offload/test/offloading/fortran/target-depend.f90 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
index db58f2db6bbe9..1c5ab1efcfbdd 100644
--- a/offload/test/offloading/fortran/target-depend.f90
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -1,5 +1,5 @@
-! Offloading test checking interaction of fixed size
-! arrays with enter, exit and target
+! Offloading test checking the use of the depend clause on
+! the target construct
 ! REQUIRES: flang, amdgcn-amd-amdhsa
 ! UNSUPPORTED: nvptx64-nvidia-cuda
 ! UNSUPPORTED: nvptx64-nvidia-cuda-LTO

>From 29566b122a154739e2591ec6c6ac3ea487e63ac1 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Tue, 11 Jun 2024 11:53:36 -0500
Subject: [PATCH 20/24] Incorporate changes for review comments

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 190 ++++++++++--------
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |   3 +-
 .../test/offloading/fortran/target-depend.f90 |  22 +-
 3 files changed, 121 insertions(+), 94 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 5e084a5acb6f2..7a62f00952640 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -792,9 +792,6 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
 
   if (!OffloadInfoManager.empty())
     createOffloadEntriesAndInfoMetadata(ErrorReportFn);
-
-  LLVM_DEBUG(dbgs() << "Module after OMPIRBuilder::finalize\n");
-  LLVM_DEBUG(dbgs() << M << "\n");
 }
 
 OpenMPIRBuilder::~OpenMPIRBuilder() {
@@ -1707,55 +1704,65 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
 //   the corresponding dependence.
 // - All code is inserted in the entry block of the current function.
 static Value *
-emitDepArray(OpenMPIRBuilder &OMPBuilder,
-             SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
+emitTaskDependencies(OpenMPIRBuilder &OMPBuilder,
+                     SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
   // Early return if we have no dependencies to process
-  if (!Dependencies.size())
+  if (Dependencies.empty())
     return nullptr;
 
+  // Given a vector of DependData objects, in this function we create an
+  // array on the stack that holds kmp_dep_info objects corresponding
+  // to each dependency. This is then passed to the OpenMP runtime.
+  // For example, if there are 'n' dependencies then the following pseudo
+  // code is generated. Assume the first dependence is on a variable 'a'
+  //
+  // \code{c}
+  // DepArray = alloc(n x sizeof(kmp_depend_info));
+  // idx = 0;
+  // DepArray[idx].base_addr = ptrtoint(&a);
+  // DepArray[idx].len = 8;
+  // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
+  // ++idx;
+  // DepArray[idx].base_addr = ...;
+  // \endcode
+
   IRBuilderBase &Builder = OMPBuilder.Builder;
   Type *DependInfo = OMPBuilder.DependInfo;
   Module &M = OMPBuilder.M;
 
   Value *DepArray = nullptr;
-  if (Dependencies.size()) {
-    OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
-    Builder.SetInsertPoint(
-        &OldIP.getBlock()->getParent()->getEntryBlock().back());
-
-    Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
-    DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
-
-    unsigned P = 0;
-    for (const OpenMPIRBuilder::DependData &Dep : Dependencies) {
-      Value *Base =
-          Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
-      // Store the pointer to the variable
-      Value *Addr = Builder.CreateStructGEP(
-          DependInfo, Base,
-          static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
-      Value *DepValPtr =
-          Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
-      Builder.CreateStore(DepValPtr, Addr);
-      // Store the size of the variable
-      Value *Size = Builder.CreateStructGEP(
-          DependInfo, Base,
-          static_cast<unsigned int>(RTLDependInfoFields::Len));
-      Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
-                              Dep.DepValueType)),
-                          Size);
-      // Store the dependency kind
-      Value *Flags = Builder.CreateStructGEP(
-          DependInfo, Base,
-          static_cast<unsigned int>(RTLDependInfoFields::Flags));
-      Builder.CreateStore(
-          ConstantInt::get(Builder.getInt8Ty(),
-                           static_cast<unsigned int>(Dep.DepKind)),
-          Flags);
-      ++P;
-    }
-    Builder.restoreIP(OldIP);
-  }
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  Builder.SetInsertPoint(
+      OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
+
+  Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
+  DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
+
+  for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
+    Value *Base =
+        Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
+    // Store the pointer to the variable
+    Value *Addr = Builder.CreateStructGEP(
+        DependInfo, Base,
+        static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
+    Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
+    Builder.CreateStore(DepValPtr, Addr);
+    // Store the size of the variable
+    Value *Size = Builder.CreateStructGEP(
+        DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
+    Builder.CreateStore(
+        Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
+        Size);
+    // Store the dependency kind
+    Value *Flags = Builder.CreateStructGEP(
+        DependInfo, Base,
+        static_cast<unsigned int>(RTLDependInfoFields::Flags));
+    Builder.CreateStore(
+        ConstantInt::get(Builder.getInt8Ty(),
+                         static_cast<unsigned int>(Dep.DepKind)),
+        Flags);
+  }
+  Builder.restoreIP(OldIP);
   return DepArray;
 }
 
@@ -5273,19 +5280,37 @@ static Function *createOutlinedFunction(
   return Func;
 }
 
-// Create an entry point for a target task with the following.
-// It'll have the following signature
-// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
-// This function is called from emitTargetTask once the
-// code to launch the target kernel has been outlined already.
-static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
-                                       IRBuilderBase &Builder,
-                                       CallInst *StaleCI) {
+/// Create an entry point for a target task with the following.
+/// It'll have the following signature
+/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
+/// This function is called from emitTargetTask once the
+/// code to launch the target kernel has been outlined already.
+static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
+                                             IRBuilderBase &Builder,
+                                             CallInst *StaleCI) {
   Module &M = OMPBuilder.M;
-  // CalledFunction is the target launch function, i.e.
+  // KernelLaunchFunction is the target launch function, i.e.
   // the function that sets up kernel arguments and calls
   // __tgt_target_kernel to launch the kernel on the device.
-  Function *CalledFunction = StaleCI->getCalledFunction();
+  //
+  Function *KernelLaunchFunction = StaleCI->getCalledFunction();
+
+  // StaleCI is the CallInst which is the call to the outlined
+  // target kernel launch function. If there are values that the
+  // outlined function uses then these are aggregated into a structure
+  // which is passed as the second argument. If not, then there's
+  // only one argument, the threadID. So, StaleCI can be
+  //
+  // %structArg = alloca { ptr, ptr }, align 8
+  // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
+  // store ptr %20, ptr %gep_, align 8
+  // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
+  // store ptr %21, ptr %gep_8, align 8
+  // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
+  //
+  // OR
+  //
+  // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
   OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
                                     StaleCI->getIterator());
   LLVMContext &Ctx = StaleCI->getParent()->getContext();
@@ -5298,6 +5323,8 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
   auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
                                   ".omp_target_task_proxy_func",
                                   Builder.GetInsertBlock()->getModule());
+  ProxyFn->getArg(0)->setName("thread.id");
+  ProxyFn->getArg(1)->setName("task");
 
   BasicBlock *EntryBB =
       BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
@@ -5311,20 +5338,17 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
   assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
          "StaleCI with shareds should have exactly two arguments.");
   if (HasShareds) {
-    AllocaInst *ArgStructAlloca =
-        dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+    auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
     assert(ArgStructAlloca &&
            "Unable to find the alloca instruction corresponding to arguments "
            "for extracted function");
-    StructType *ArgStructType =
+    auto *ArgStructType =
         dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
-    LLVM_DEBUG(dbgs() << "ArgStructType = " << *ArgStructType << "\n");
 
     AllocaInst *NewArgStructAlloca =
         Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
     Value *TaskT = ProxyFn->getArg(1);
     Value *ThreadId = ProxyFn->getArg(0);
-    LLVM_DEBUG(dbgs() << "TaskT = " << *TaskT << "\n");
     Value *SharedsSize =
         Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
 
@@ -5332,16 +5356,12 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
     LoadInst *LoadShared =
         Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
 
-    // TODO: Are these alignment values correct?
     Builder.CreateMemCpy(
-        NewArgStructAlloca,
-        NewArgStructAlloca->getPointerAlignment(M.getDataLayout()), LoadShared,
+        NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
         LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
 
-    Builder.CreateCall(CalledFunction, {ThreadId, NewArgStructAlloca});
+    Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
   }
-  ProxyFn->getArg(0)->setName("thread.id");
-  ProxyFn->getArg(1)->setName("task");
   Builder.CreateRetVoid();
   return ProxyFn;
 }
@@ -5439,9 +5459,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
   //   that of the kernel_launch function.
   //
   //   kernel_launch_function is generated by emitKernelLaunch and has the
-  //   always_inline attribute. void kernel_launch_function(thread_id,
-  //                                                        structArg)
-  //                                                        alwaysinline {
+  //   always_inline attribute.
+  //   void kernel_launch_function(thread_id,
+  //                               structArg) alwaysinline {
   //       %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
   //       offload_baseptrs = load(getelementptr structArg, 0, 0)
   //       offload_ptrs = load(getelementptr structArg, 0, 1)
@@ -5482,7 +5502,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
   // ('kernel_launch_function' in the pseudo code above). This function is then
   // called by the target task proxy function (see
   // '@.omp_target_task_proxy_func' in the pseudo code above)
-  // "@.omp_target_task_proxy_func' is generated by emitProxyTaskFunction
+  // '@.omp_target_task_proxy_func' is generated by emitTargetTaskProxyFunction
   Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
                                      EmitTargetCallFallbackCB, Args, DeviceID,
                                      RTLoc, TargetTaskAllocaIP));
@@ -5496,20 +5516,14 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
     CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
     bool HasShareds = StaleCI->arg_size() > 1;
 
-    LLVM_DEBUG(dbgs() << "StaleCI in PostOutlineCB in emitTargetTask = "
-                      << *StaleCI << "\n");
-    LLVM_DEBUG(dbgs() << "Module in PostOutlineCB in emitTargetTask = "
-                      << *(StaleCI->getParent()->getParent()->getParent())
-                      << "\n");
-
-    Function *ProxyFn = emitProxyTaskFunction(*this, Builder, StaleCI);
+    Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
 
     LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
                       << "\n");
 
     Builder.SetInsertPoint(StaleCI);
 
-    // Gather the arguments for emitting the runtime call for
+    // Gather the arguments for emitting the runtime call.
     uint32_t SrcLocStrSize;
     Constant *SrcLocStr =
         getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
@@ -5527,20 +5541,19 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
     // Tasksize refers to the size in bytes of kmp_task_t data structure
     // including private vars accessed in task.
     // TODO: add kmp_task_t_with_privates (privates)
-    Value *TaskSize = Builder.getInt64(
-        divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
+    Value *TaskSize =
+        Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
 
     // Argument - `sizeof_shareds` (SharedsSize)
     // SharedsSize refers to the shareds array size in the kmp_task_t data
     // structure.
     Value *SharedsSize = Builder.getInt64(0);
     if (HasShareds) {
-      AllocaInst *ArgStructAlloca =
-          dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+      auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
       assert(ArgStructAlloca &&
              "Unable to find the alloca instruction corresponding to arguments "
              "for extracted function");
-      StructType *ArgStructType =
+      auto *ArgStructType =
           dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
       assert(ArgStructType && "Unable to find struct type corresponding to "
                               "arguments for extracted function");
@@ -5572,7 +5585,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
                            SharedsSize);
     }
 
-    Value *DepArray = emitDepArray(*this, Dependencies);
+    Value *DepArray = emitTaskDependencies(*this, Dependencies);
 
     // ---------------------------------------------------------------
     // V5.2 13.8 target construct
@@ -5588,8 +5601,11 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
             getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
         Builder.CreateCall(
             TaskWaitFn,
-            {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
-             ConstantInt::get(Builder.getInt32Ty(), 0),
+            {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
+             /*ndeps=*/Builder.getInt32(Dependencies.size()),
+             /*dep_list=*/DepArray,
+             /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
+             /*noalias_dep_list=*/
              ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
       }
       // Included task.
@@ -5660,8 +5676,6 @@ static void emitTargetCall(
   //  emitKernelLaunch
   auto &&EmitTargetCallFallbackCB =
       [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
-    LLVM_DEBUG(dbgs() << "EmitTargetCallFallbackCB::Builder = " << &Builder
-                      << "\n");
     Builder.restoreIP(IP);
     Builder.CreateCall(OutlinedFn, Args);
     return Builder.saveIP();
@@ -5999,8 +6013,6 @@ void OpenMPIRBuilder::emitOffloadingArrays(
     return;
 
   Builder.restoreIP(AllocaIP);
-  LLVM_DEBUG(dbgs() << "Basicblock before emitOffloadingArrays\n"
-                    << *(Builder.GetInsertBlock()) << "\n");
   // Detect if we have any capture size requiring runtime evaluation of the
   // size so that a constant array could be eventually used.
   ArrayType *PointerArrayType =
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 74f34e227d9f0..e324730c39a17 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -681,10 +681,11 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
       ompLoc, bodyCB, numTeamsLower, numTeamsUpper, threadLimit, ifExpr));
   return bodyGenStatus;
 }
+
 static void
 buildDependData(std::optional<ArrayAttr> depends, OperandRange dependVars,
                 LLVM::ModuleTranslation &moduleTranslation,
-                SmallVector<llvm::OpenMPIRBuilder::DependData> &dds) {
+                SmallVectorImpl<llvm::OpenMPIRBuilder::DependData> &dds) {
   for (auto dep : llvm::zip(dependVars, depends->getValue())) {
     llvm::omp::RTLDependenceKindTy type;
     switch (
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
index 1c5ab1efcfbdd..cd256fa3f2164 100644
--- a/offload/test/offloading/fortran/target-depend.f90
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -13,25 +13,39 @@ program main
   integer :: a = 0
   call foo(5, a)
   print*, "======= FORTRAN Test passed! ======="
-  print*, "foo(5) returned ", a, ", expected 6\n"
+  print*, "foo(5) returned ", a, ", expected 8\n"
   !       stop 0
 end program main
 subroutine foo(N, r)
   integer, intent(in) :: N
   integer, intent(out) :: r
-  integer :: z
-
+  integer :: z, i
   z = 1
+  ! Spawn 3 threads
+  !$omp parallel num_threads(3)
+
+  ! Each thread redundantly updates z to N
+  ! i.e. 5
   !$omp task depend(out: z) shared(z)
+  do while (i < 32766)
+     ! dumb loop to slow down the update of
+     ! z
+     i = i + 1
+  end do
   z = N
   !$omp end task
 
+  ! z is 5 now. Each thread then offloads
+  ! increment of z by 1. So, z is incremented
+  ! three times.
   !$omp target map(tofrom: z) depend(in: z)
   z = z + 1
   !$omp end target
+  !$omp end parallel
 
+  ! z is 8.
   r = z
 end subroutine foo
 
 !CHECK: ======= FORTRAN Test passed! =======
-!CHECK: foo(5) returned 6 , expected 6
+!CHECK: foo(5) returned 8 , expected 8

>From a12ffb476c8f28a975d3e565df96bdb14f5ae961 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 21 Jun 2024 00:02:02 -0500
Subject: [PATCH 21/24] Update testcase to make it more reliable by correcting
 the use of the depend clause (sibling tasks)

---
 .../test/offloading/fortran/target-depend.f90 | 92 +++++++++++++------
 1 file changed, 62 insertions(+), 30 deletions(-)

diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
index cd256fa3f2164..e7729a8c31e6b 100644
--- a/offload/test/offloading/fortran/target-depend.f90
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -10,42 +10,74 @@
 
 ! RUN: %libomptarget-compile-fortran-run-and-check-generic
 program main
+  implicit none
   integer :: a = 0
+  INTERFACE
+     FUNCTION omp_get_device_num() BIND(C)
+       USE, INTRINSIC :: iso_c_binding, ONLY: C_INT
+       integer :: omp_get_device_num
+     END FUNCTION omp_get_device_num
+  END INTERFACE
+
   call foo(5, a)
   print*, "======= FORTRAN Test passed! ======="
-  print*, "foo(5) returned ", a, ", expected 8\n"
-  !       stop 0
-end program main
-subroutine foo(N, r)
-  integer, intent(in) :: N
-  integer, intent(out) :: r
-  integer :: z, i
-  z = 1
-  ! Spawn 3 threads
-  !$omp parallel num_threads(3)
+  print*, "foo(5) returned ", a, ", expected 6\n"
 
-  ! Each thread redundantly updates z to N
-  ! i.e. 5
-  !$omp task depend(out: z) shared(z)
-  do while (i < 32766)
-     ! dumb loop to slow down the update of
-     ! z
-     i = i + 1
-  end do
-  z = N
-  !$omp end task
+  !       stop 0
+  contains
+    subroutine foo(N, r)
+      integer, intent(in) :: N
+      integer, intent(out) :: r
+      integer :: z, i, j, k, accumulator
+      z = 1
+      accumulator = 0
+      ! Spawn 3 threads
+      !$omp parallel num_threads(3)
 
-  ! z is 5 now. Each thread then offloads
-  ! increment of z by 1. So, z is incremented
-  ! three times.
-  !$omp target map(tofrom: z) depend(in: z)
-  z = z + 1
-  !$omp end target
-  !$omp end parallel
+      ! A single thread will then create two tasks
+      ! One is the 'producer' and potentially slower
+      ! task that updates 'z' to 'N'. The second is an
+      ! offloaded target task that increments 'z'.
+      ! If the depend clauses work properly, the
+      ! target task should wait for the 'producer'
+      ! task to complete before incrementing z
+      ! We use !$omp single here because only
+      ! the depend clause establishes dependencies
+      ! between sibling tasks only. This is the easiest
+      ! way of creating two sibling tasks.
+      !$omp single
+      !$omp task depend(out: z) shared(z)
+      do while (k < 32000)
+         do while (j < 32766)
+            do while (i < 32766)
+               ! dumb loop nest to slow down the update of
+               ! z
+               i = i + 1
+               ! Adding a function call slows down the producer
+               ! to the point that removing the depend clause
+               ! from the target construct below frequently
+               ! results in the wrong answer.
+               accumulator = accumulator + omp_get_device_num()
+            end do
+            j = j +1
+         end do
+         k = k + 1
+      end do
+      z = N
+      !$omp end task
 
-  ! z is 8.
-  r = z
+      ! z is 5 now. Increment z to 6.
+      !$omp target map(tofrom: z) depend(in:z)
+      z = z + 1
+      !$omp end target
+      !$omp end single
+      !$omp end parallel
+      ! Use 'accumulator' so it is not optimized away
+      ! by the compiler.
+      print *, accumulator
+      r = z
 end subroutine foo
 
 !CHECK: ======= FORTRAN Test passed! =======
-!CHECK: foo(5) returned 8 , expected 8
+!CHECK: foo(5) returned 6 , expected 6
+end program main

>From ac34fd18fc503034de96a9eb42b74ce580d61ca4 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 1 Jul 2024 17:10:06 -0500
Subject: [PATCH 22/24] Address more review comments

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  6 +-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 59 +++++++++++--------
 .../test/offloading/fortran/target-depend.f90 | 42 +++++--------
 3 files changed, 52 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 7a62f00952640..6de2318c23659 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5706,9 +5706,9 @@ static void emitTargetCall(
   // The presence of certain clauses on the target directive require the
   // explicit generation of the target task.
   if (RequiresOuterTargetTask) {
-    OMPBuilder.emitTargetTask(OutlinedFn, OutlinedFnID,
-                              EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,
-                              AllocaIP, Dependencies, HasNoWait);
+    Builder.restoreIP(OMPBuilder.emitTargetTask(
+        OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, DeviceID,
+        RTLoc, AllocaIP, Dependencies, HasNoWait));
   } else {
     Builder.restoreIP(OMPBuilder.emitKernelLaunch(
         Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index e324730c39a17..6b6679c8cfecd 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -683,27 +683,38 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
 }
 
 static void
-buildDependData(std::optional<ArrayAttr> depends, OperandRange dependVars,
-                LLVM::ModuleTranslation &moduleTranslation,
+buildDependData(Operation *op, LLVM::ModuleTranslation &moduleTranslation,
                 SmallVectorImpl<llvm::OpenMPIRBuilder::DependData> &dds) {
-  for (auto dep : llvm::zip(dependVars, depends->getValue())) {
-    llvm::omp::RTLDependenceKindTy type;
-    switch (
-        cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
-    case mlir::omp::ClauseTaskDepend::taskdependin:
-      type = llvm::omp::RTLDependenceKindTy::DepIn;
-      break;
-    // The OpenMP runtime requires that the codegen for 'depend' clause for
-    // 'out' dependency kind must be the same as codegen for 'depend' clause
-    // with 'inout' dependency.
-    case mlir::omp::ClauseTaskDepend::taskdependout:
-    case mlir::omp::ClauseTaskDepend::taskdependinout:
-      type = llvm::omp::RTLDependenceKindTy::DepInOut;
-      break;
-    };
-    llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
-    llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
-    dds.emplace_back(dd);
+  auto processDepends = [&](std::optional<ArrayAttr> depends,
+                            OperandRange dependVars) {
+    if (dependVars.empty())
+      return;
+    for (auto dep : llvm::zip(dependVars, depends->getValue())) {
+      llvm::omp::RTLDependenceKindTy type;
+      switch (
+          cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
+      case mlir::omp::ClauseTaskDepend::taskdependin:
+        type = llvm::omp::RTLDependenceKindTy::DepIn;
+        break;
+      // The OpenMP runtime requires that the codegen for 'depend' clause for
+      // 'out' dependency kind must be the same as codegen for 'depend' clause
+      // with 'inout' dependency.
+      case mlir::omp::ClauseTaskDepend::taskdependout:
+      case mlir::omp::ClauseTaskDepend::taskdependinout:
+        type = llvm::omp::RTLDependenceKindTy::DepInOut;
+        break;
+      };
+      llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
+      llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
+      dds.emplace_back(dd);
+    }
+  };
+
+  if (auto taskOp = dyn_cast<omp::TaskOp>(op)) {
+    processDepends(taskOp.getDepends(), taskOp.getDependVars());
+  }
+  if (auto targetOp = dyn_cast<omp::TargetOp>(op)) {
+    processDepends(targetOp.getDepends(), targetOp.getDependVars());
   }
 }
 /// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
@@ -729,9 +740,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
   };
 
   SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
-  if (!taskOp.getDependVars().empty() && taskOp.getDepends())
-    buildDependData(taskOp.getDepends(), taskOp.getDependVars(),
-                    moduleTranslation, dds);
+  buildDependData(taskOp.getOperation(), moduleTranslation, dds);
 
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
       findAllocaInsertPoint(builder, moduleTranslation);
@@ -3094,9 +3103,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
       kernelInput.push_back(mapData.OriginalValue[i]);
   }
   SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
-  if (!targetOp.getDependVars().empty() && targetOp.getDepends())
-    buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
-                    moduleTranslation, dds);
+  buildDependData(targetOp.getOperation(), moduleTranslation, dds);
 
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
       ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
index e7729a8c31e6b..81e1770465a42 100644
--- a/offload/test/offloading/fortran/target-depend.f90
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -34,34 +34,25 @@ subroutine foo(N, r)
       ! Spawn 3 threads
       !$omp parallel num_threads(3)
 
-      ! A single thread will then create two tasks
-      ! One is the 'producer' and potentially slower
-      ! task that updates 'z' to 'N'. The second is an
-      ! offloaded target task that increments 'z'.
-      ! If the depend clauses work properly, the
-      ! target task should wait for the 'producer'
-      ! task to complete before incrementing z
-      ! We use !$omp single here because only
-      ! the depend clause establishes dependencies
-      ! between sibling tasks only. This is the easiest
-      ! way of creating two sibling tasks.
+      ! A single thread will then create two tasks - one is the 'producer' and
+      ! potentially slower task that updates 'z' to 'N'. The second is an
+      ! offloaded target task that increments 'z'. If the depend clauses work
+      ! properly, the target task should wait for the 'producer' task to
+      ! complete before incrementing 'z'. We use 'omp single' here because the
+      ! depend clause establishes dependencies between sibling tasks only.
+      ! This is the easiest way of creating two sibling tasks.
       !$omp single
       !$omp task depend(out: z) shared(z)
-      do while (k < 32000)
-         do while (j < 32766)
-            do while (i < 32766)
-               ! dumb loop nest to slow down the update of
-               ! z
-               i = i + 1
-               ! Adding a function call slows down the producer
-               ! to the point that removing the depend clause
-               ! from the target construct below frequently
-               ! results in the wrong answer.
+      do k=1, 32766
+         do j=1, 32766
+            do i = 1, 32766
+               ! dumb loop nest to slow down the update of 'z'.
+               ! Adding a function call slows down the producer to the point
+               ! that removing the depend clause from the target construct below
+               ! frequently results in the wrong answer.
                accumulator = accumulator + omp_get_device_num()
             end do
-            j = j +1
          end do
-         k = k + 1
       end do
       z = N
       !$omp end task
@@ -72,11 +63,10 @@ subroutine foo(N, r)
       !$omp end target
       !$omp end single
       !$omp end parallel
-      ! Use 'accumulator' so it is not optimized away
-      ! by the compiler.
+      ! Use 'accumulator' so it is not optimized away by the compiler.
       print *, accumulator
       r = z
-end subroutine foo
+    end subroutine foo
 
 !CHECK: ======= FORTRAN Test passed! =======
 !CHECK: foo(5) returned 6 , expected 6

>From c0e2ceb0d88e564ec50fc96df6869888e4d133d9 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Tue, 2 Jul 2024 08:51:46 -0500
Subject: [PATCH 23/24] Incorporate one more review comment

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 6de2318c23659..b045f6c6961a3 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5481,10 +5481,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
   BasicBlock *TargetTaskAllocaBB =
       splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
 
-  InsertPointTy TargetTaskAllocaIP =
-      InsertPointTy(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
-  InsertPointTy TargetTaskBodyIP =
-      InsertPointTy(TargetTaskBodyBB, TargetTaskBodyBB->begin());
+  InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
+                                   TargetTaskAllocaBB->begin());
+  InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
 
   OutlineInfo OI;
   OI.EntryBB = TargetTaskAllocaBB;

>From 9b49c09084aee2fed711e30d4a1f0410c69e0095 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Tue, 9 Jul 2024 01:09:06 -0500
Subject: [PATCH 24/24] Incorporate changes from review comments

---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 59 ++++++++-----------
 1 file changed, 26 insertions(+), 33 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6b6679c8cfecd..391bbacc2f6cd 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -683,38 +683,29 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
 }
 
 static void
-buildDependData(Operation *op, LLVM::ModuleTranslation &moduleTranslation,
+buildDependData(std::optional<ArrayAttr> depends, OperandRange dependVars,
+                LLVM::ModuleTranslation &moduleTranslation,
                 SmallVectorImpl<llvm::OpenMPIRBuilder::DependData> &dds) {
-  auto processDepends = [&](std::optional<ArrayAttr> depends,
-                            OperandRange dependVars) {
-    if (dependVars.empty())
-      return;
-    for (auto dep : llvm::zip(dependVars, depends->getValue())) {
-      llvm::omp::RTLDependenceKindTy type;
-      switch (
-          cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
-      case mlir::omp::ClauseTaskDepend::taskdependin:
-        type = llvm::omp::RTLDependenceKindTy::DepIn;
-        break;
-      // The OpenMP runtime requires that the codegen for 'depend' clause for
-      // 'out' dependency kind must be the same as codegen for 'depend' clause
-      // with 'inout' dependency.
-      case mlir::omp::ClauseTaskDepend::taskdependout:
-      case mlir::omp::ClauseTaskDepend::taskdependinout:
-        type = llvm::omp::RTLDependenceKindTy::DepInOut;
-        break;
-      };
-      llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
-      llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
-      dds.emplace_back(dd);
-    }
-  };
-
-  if (auto taskOp = dyn_cast<omp::TaskOp>(op)) {
-    processDepends(taskOp.getDepends(), taskOp.getDependVars());
-  }
-  if (auto targetOp = dyn_cast<omp::TargetOp>(op)) {
-    processDepends(targetOp.getDepends(), targetOp.getDependVars());
+  if (dependVars.empty())
+    return;
+  for (auto dep : llvm::zip(dependVars, depends->getValue())) {
+    llvm::omp::RTLDependenceKindTy type;
+    switch (
+        cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
+    case mlir::omp::ClauseTaskDepend::taskdependin:
+      type = llvm::omp::RTLDependenceKindTy::DepIn;
+      break;
+    // The OpenMP runtime requires that the codegen for 'depend' clause for
+    // 'out' dependency kind must be the same as codegen for 'depend' clause
+    // with 'inout' dependency.
+    case mlir::omp::ClauseTaskDepend::taskdependout:
+    case mlir::omp::ClauseTaskDepend::taskdependinout:
+      type = llvm::omp::RTLDependenceKindTy::DepInOut;
+      break;
+    };
+    llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
+    llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
+    dds.emplace_back(dd);
   }
 }
 /// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
@@ -740,7 +731,8 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
   };
 
   SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
-  buildDependData(taskOp.getOperation(), moduleTranslation, dds);
+  buildDependData(taskOp.getDepends(), taskOp.getDependVars(),
+                  moduleTranslation, dds);
 
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
       findAllocaInsertPoint(builder, moduleTranslation);
@@ -3103,7 +3095,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
       kernelInput.push_back(mapData.OriginalValue[i]);
   }
   SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
-  buildDependData(targetOp.getOperation(), moduleTranslation, dds);
+  buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
+                  moduleTranslation, dds);
 
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
       ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,



More information about the cfe-commits mailing list