[Mlir-commits] [clang] [llvm] [mlir] [OMPIRBuilder] - Handle dependencies in `createTarget` (PR #93977)
Pranav Bhandarkar
llvmlistbot at llvm.org
Fri Jul 19 10:01:30 PDT 2024
https://github.com/bhandarkar-pranav updated https://github.com/llvm/llvm-project/pull/93977
>From 8060e0bb038166ead68eb6068e6559325a605c0c Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 6 May 2024 23:05:37 -0500
Subject: [PATCH 01/26] Add a flag to choose new codegen
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index f6d12d46cfc07..7b6e93e2122aa 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -48,6 +48,10 @@
using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;
+// Experimental flag to opt in to the new target task codegen path.
+static llvm::cl::opt<bool> NewClangTargetTaskCodeGen(
+ "new-clang-target-task-codegen", llvm::cl::Optional,
+ llvm::cl::desc("new clang target task codegen."), llvm::cl::init(false));
namespace {
/// Base class for handling code generation inside OpenMP regions.
@@ -9620,9 +9624,13 @@ static void emitTargetCallKernelLaunch(
DeviceID, RTLoc, AllocaIP));
};
- if (RequiresOuterTask)
- CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
- else
+ if (RequiresOuterTask) {
+ if (NewClangTargetTaskCodeGen) {
+ llvm::errs() << "Using OMPIRBuilder for target task codegen\n";
+ } else {
+ CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
+ }
+ } else
OMPRuntime->emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen);
}
>From 83f09a5fb05f440d7f9de36d6bb9e693227d66ac Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 13 May 2024 11:21:33 -0500
Subject: [PATCH 02/26] clang prints for debugging
---
clang/lib/CodeGen/CGStmtOpenMP.cpp | 6 +++-
clang/lib/CodeGen/CodeGenFunction.h | 3 ++
clang/lib/Parse/ParseOpenMP.cpp | 12 +++++++-
clang/lib/Sema/SemaOpenMP.cpp | 28 ++++++++++++++++++-
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 7 ++++-
5 files changed, 52 insertions(+), 4 deletions(-)
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 6410f9e102c90..200dd1878a449 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5043,7 +5043,11 @@ createImplicitFirstprivateForType(ASTContext &C, OMPTaskDataTy &Data,
Data.FirstprivateInits.emplace_back(InitRef);
return OrigVD;
}
-
+void CodeGenFunction::NewEmitOMPTargetTaskBasedDirective(
+ const OMPExecutableDirective &S, const RegionCodeGenTy &BodyGen,
+ OMPTargetDataInfo &InputInfo) {
+ EmitOMPTargetTaskBasedDirective(S, BodyGen, InputInfo);
+}
void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
const OMPExecutableDirective &S, const RegionCodeGenTy &BodyGen,
OMPTargetDataInfo &InputInfo) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 45585361a4fc9..f30666226c4df 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3797,6 +3797,9 @@ class CodeGenFunction : public CodeGenTypeCache {
void EmitOMPTargetTaskBasedDirective(const OMPExecutableDirective &S,
const RegionCodeGenTy &BodyGen,
OMPTargetDataInfo &InputInfo);
+ void NewEmitOMPTargetTaskBasedDirective(const OMPExecutableDirective &S,
+ const RegionCodeGenTy &BodyGen,
+ OMPTargetDataInfo &InputInfo);
void processInReduction(const OMPExecutableDirective &S,
OMPTaskDataTy &Data,
CodeGenFunction &CGF,
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index e959dd6378f46..ec07a7d3854af 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2972,11 +2972,19 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
// FIXME: We create a bogus CompoundStmt scope to hold the contents of
// the captured region. Code elsewhere assumes that any FunctionScopeInfo
// should have at least one compound statement scope within it.
+ if (AssociatedStmt.get()) {
+ llvm::errs() << __FUNCTION__ << "Loc-1:\n";
+ AssociatedStmt.get()->dump();
+ }
ParsingOpenMPDirectiveRAII NormalScope(*this, /*Value=*/false);
{
Sema::CompoundScopeRAII Scope(Actions);
AssociatedStmt = ParseStatement();
-
+ Stmt * pdb_print = AssociatedStmt.get();
+ if (pdb_print) {
+ llvm::errs() << __FUNCTION__ << "Loc0:\n";
+ pdb_print->dump();
+ }
if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
getLangOpts().OpenMPIRBuilder)
AssociatedStmt =
@@ -2984,6 +2992,8 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
}
AssociatedStmt =
Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses);
+ llvm::errs() << __FUNCTION__ << "Loc1:\n";
+ AssociatedStmt.get()->dump();
} else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data ||
DKind == OMPD_target_exit_data) {
Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope());
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index bab61e8fd54e8..db882f52b225e 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4854,6 +4854,19 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, DSAStack->getCurrentDirective());
+ llvm::errs() << __FUNCTION__ << ": Loc0:\n";
+ for (OpenMPDirectiveKind c : CaptureRegions) {
+ switch(c) {
+ case OMPD_task:
+ llvm::errs() << "OMPD_task\n";
+ break;
+ case OMPD_target:
+ llvm::errs() << "OMPD_target\n";
+ break;
+ default:
+ llvm::errs() << "default\n";
+ }
+ }
OMPOrderedClause *OC = nullptr;
OMPScheduleClause *SC = nullptr;
SmallVector<const OMPLinearClause *, 4> LCs;
@@ -5005,7 +5018,11 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
}
if (++CompletedRegions == CaptureRegions.size())
DSAStack->setBodyComplete();
+ llvm::errs() << __FUNCTION__ << ": Loc1:\n";
+ SR.get()->dump();
SR = SemaRef.ActOnCapturedRegionEnd(SR.get());
+ llvm::errs() << __FUNCTION__ << ": Loc2:\n";
+ SR.get()->dump();
}
return SR;
}
@@ -6337,7 +6354,16 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
OpenMPBindClauseKind BindKind = OMPC_BIND_unknown;
llvm::SmallVector<OMPClause *> ClausesWithoutBind;
bool UseClausesWithoutBind = false;
-
+ if (Kind == Directive::OMPD_target) {
+ if (AStmt) {
+ llvm::errs() << __FUNCTION__ << "***********************\n";
+ AStmt->dump();
+ llvm::errs() << __FUNCTION__ << "***PRETTY***\n";
+ AStmt->dumpPretty(getASTContext());
+ } else {
+ llvm::errs() << "__FUNCTION__" << ": AStmt is nullptr\n";
+ }
+ }
if (const OMPBindClause *BC =
OMPExecutableDirective::getSingleClause<OMPBindClause>(Clauses))
BindKind = BC->getBindKind();
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index e54ec4f2b1d72..7e414f7406bf4 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1518,7 +1518,12 @@ class OpenMPIRBuilder {
std::forward_list<CanonicalLoopInfo> LoopInfos;
/// Add a new region that will be outlined later.
- void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
+ void addOutlineInfo(OutlineInfo &&OI) {
+ llvm::errs() << "Adding outline info\n";
+ llvm::errs() << "OI.EntryBB = ";
+ OI.EntryBB->dump();
+ OutlineInfos.emplace_back(OI);
+ }
/// An ordered map of auto-generated variables to their unique names.
/// It stores variables with the following names: 1) ".gomp_critical_user_" +
>From e2aa768c4de30fd0ed52e96c70e2395a9710a929 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 13 May 2024 16:25:51 -0500
Subject: [PATCH 03/26] add an option -new-ompirbuilder-target-codegen to
enable dependency-based target codegen path in OMPIRBuilder
---
clang/lib/CodeGen/CodeGenFunction.h | 4 +-
clang/lib/Parse/ParseOpenMP.cpp | 2 +-
clang/lib/Sema/SemaOpenMP.cpp | 7 ++-
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 9 +++
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 57 ++++++++++++++---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 61 +++++++++++--------
6 files changed, 102 insertions(+), 38 deletions(-)
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index f30666226c4df..b80be8ed85458 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3798,8 +3798,8 @@ class CodeGenFunction : public CodeGenTypeCache {
const RegionCodeGenTy &BodyGen,
OMPTargetDataInfo &InputInfo);
void NewEmitOMPTargetTaskBasedDirective(const OMPExecutableDirective &S,
- const RegionCodeGenTy &BodyGen,
- OMPTargetDataInfo &InputInfo);
+ const RegionCodeGenTy &BodyGen,
+ OMPTargetDataInfo &InputInfo);
void processInReduction(const OMPExecutableDirective &S,
OMPTaskDataTy &Data,
CodeGenFunction &CGF,
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index ec07a7d3854af..ff6d68f616207 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2980,7 +2980,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
{
Sema::CompoundScopeRAII Scope(Actions);
AssociatedStmt = ParseStatement();
- Stmt * pdb_print = AssociatedStmt.get();
+ Stmt *pdb_print = AssociatedStmt.get();
if (pdb_print) {
llvm::errs() << __FUNCTION__ << "Loc0:\n";
pdb_print->dump();
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index db882f52b225e..f0a3ec40dee78 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4856,7 +4856,7 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
getOpenMPCaptureRegions(CaptureRegions, DSAStack->getCurrentDirective());
llvm::errs() << __FUNCTION__ << ": Loc0:\n";
for (OpenMPDirectiveKind c : CaptureRegions) {
- switch(c) {
+ switch (c) {
case OMPD_task:
llvm::errs() << "OMPD_task\n";
break;
@@ -6358,10 +6358,11 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
if (AStmt) {
llvm::errs() << __FUNCTION__ << "***********************\n";
AStmt->dump();
- llvm::errs() << __FUNCTION__ << "***PRETTY***\n";
+ llvm::errs() << __FUNCTION__ << "***PRETTY***\n";
AStmt->dumpPretty(getASTContext());
} else {
- llvm::errs() << "__FUNCTION__" << ": AStmt is nullptr\n";
+ llvm::errs() << "__FUNCTION__"
+ << ": AStmt is nullptr\n";
}
}
if (const OMPBindClause *BC =
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 7e414f7406bf4..dd7605cbae6a6 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2250,6 +2250,15 @@ class OpenMPIRBuilder {
/// \param BodyGenCB Callback that will generate the region code.
/// \param ArgAccessorFuncCB Callback that will generate accessors
/// instructions for passed in target arguments where neccessary
+
+ InsertPointTy newCreateTarget(
+ const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy AllocaIP,
+ OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, int32_t NumThreads,
+ SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
+ TargetBodyGenCallbackTy BodyGenCB,
+ TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
+ SmallVector<DependData> Dependencies = {});
InsertPointTy createTarget(const LocationDescription &Loc,
OpenMPIRBuilder::InsertPointTy AllocaIP,
OpenMPIRBuilder::InsertPointTy CodeGenIP,
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index cb4de9c8876dc..02106c7316dca 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -73,6 +73,11 @@ static cl::opt<double> UnrollThresholdFactor(
"simplifications still taking place"),
cl::init(1.5));
+static cl::opt<bool>
+ NewOMPIRBuilderTargetCodegen("new-ompirbuilder-target-codegen", cl::Hidden,
+ cl::desc("Use target-task based codegen."),
+ cl::init(false));
+
#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
@@ -5230,12 +5235,13 @@ static void emitTargetOutlinedFunction(
OutlinedFn, OutlinedFnID);
}
-static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
- OpenMPIRBuilder::InsertPointTy AllocaIP,
- Function *OutlinedFn, Constant *OutlinedFnID,
- int32_t NumTeams, int32_t NumThreads,
- SmallVectorImpl<Value *> &Args,
- OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) {
+static void emitTargetCall(
+ OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
+ OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
+ Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads,
+ SmallVectorImpl<Value *> &Args,
+ OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
+ SmallVector<llvm::OpenMPIRBuilder::DependData> dependencies = {}) {
OpenMPIRBuilder::TargetDataInfo Info(
/*RequiresDevicePointerInfo=*/false,
@@ -5276,12 +5282,49 @@ static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
NumTeamsVal, NumThreadsVal,
DynCGGroupMem, HasNoWait);
-
+ // PDB: here you'll have to break the logic down to do the following
+ // if (!requiresoutertask) {
+ // Builder.restoreIP(OMPBuilder.emitKernelLaunch(
+ // Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
+ // DeviceID, RTLoc, AllocaIP));
+ // else {
+ // create task
+ // make task call emitkernellaunch.
+ // make task call
+ // }
+ //
Builder.restoreIP(OMPBuilder.emitKernelLaunch(
Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
DeviceID, RTLoc, AllocaIP));
}
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::newCreateTarget(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
+ int32_t NumThreads, SmallVectorImpl<Value *> &Args,
+ GenMapInfoCallbackTy GenMapInfoCB,
+ OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
+ OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
+ SmallVector<DependData> Dependencies) {
+ if (!NewOMPIRBuilderTargetCodegen) {
+ llvm::errs() << "Old OpenMPIRBuilder target codegen\n";
+ return createTarget(Loc, AllocaIP, CodeGenIP, EntryInfo, NumTeams,
+ NumThreads, Args, GenMapInfoCB, CBFunc,
+ ArgAccessorFuncCB);
+ }
+ llvm::errs() << "New OpenMPIRBuilder target codegen\n";
+ if (!updateToLocation(Loc))
+ return InsertPointTy();
+ Builder.restoreIP(CodeGenIP);
+ Function *OutlinedFn;
+ Constant *OutlinedFnID;
+ emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
+ OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
+ if (!Config.isTargetDevice())
+ emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
+ NumThreads, Args, GenMapInfoCB, Dependencies);
+ return Builder.saveIP();
+}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
const LocationDescription &Loc, InsertPointTy AllocaIP,
InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6ec4c120c11ea..2fd3aef44ebd5 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -681,7 +681,30 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
ompLoc, bodyCB, numTeamsLower, numTeamsUpper, threadLimit, ifExpr));
return bodyGenStatus;
}
-
+static void
+buildDependData(std::optional<ArrayAttr> depends, OperandRange dependVars,
+ LLVM::ModuleTranslation &moduleTranslation,
+ SmallVector<llvm::OpenMPIRBuilder::DependData> &dds) {
+ for (auto dep : llvm::zip(dependVars, depends->getValue())) {
+ llvm::omp::RTLDependenceKindTy type;
+ switch (
+ cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
+ case mlir::omp::ClauseTaskDepend::taskdependin:
+ type = llvm::omp::RTLDependenceKindTy::DepIn;
+ break;
+ // The OpenMP runtime requires that the codegen for 'depend' clause for
+ // 'out' dependency kind must be the same as codegen for 'depend' clause
+ // with 'inout' dependency.
+ case mlir::omp::ClauseTaskDepend::taskdependout:
+ case mlir::omp::ClauseTaskDepend::taskdependinout:
+ type = llvm::omp::RTLDependenceKindTy::DepInOut;
+ break;
+ };
+ llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
+ llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
+ dds.emplace_back(dd);
+ }
+}
/// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
@@ -705,28 +728,10 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
};
SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
- if (!taskOp.getDependVars().empty() && taskOp.getDepends()) {
- for (auto dep :
- llvm::zip(taskOp.getDependVars(), taskOp.getDepends()->getValue())) {
- llvm::omp::RTLDependenceKindTy type;
- switch (
- cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
- case mlir::omp::ClauseTaskDepend::taskdependin:
- type = llvm::omp::RTLDependenceKindTy::DepIn;
- break;
- // The OpenMP runtime requires that the codegen for 'depend' clause for
- // 'out' dependency kind must be the same as codegen for 'depend' clause
- // with 'inout' dependency.
- case mlir::omp::ClauseTaskDepend::taskdependout:
- case mlir::omp::ClauseTaskDepend::taskdependinout:
- type = llvm::omp::RTLDependenceKindTy::DepInOut;
- break;
- };
- llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
- llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
- dds.emplace_back(dd);
- }
- }
+ if (!taskOp.getDependVars().empty() && taskOp.getDepends())
+ buildDependData(taskOp.getDepends(), taskOp.getDependVars(),
+ moduleTranslation, dds);
+ llvm::errs() << "# Dependencies in task op = " << dds.size() << "\n";
llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
findAllocaInsertPoint(builder, moduleTranslation);
@@ -3088,10 +3093,16 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
if (!mapData.IsDeclareTarget[i] && !mapData.IsAMember[i])
kernelInput.push_back(mapData.OriginalValue[i]);
}
+ SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
+ if (!targetOp.getDependVars().empty() && targetOp.getDepends())
+ buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
+ moduleTranslation, dds);
+ llvm::errs() << "# Dependencies in target op = " << dds.size() << "\n";
- builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
+ builder.restoreIP(moduleTranslation.getOpenMPBuilder()->newCreateTarget(
ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
- defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB));
+ defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB,
+ dds));
// Remap access operations to declare target reference pointers for the
// device, essentially generating extra loadop's as necessary
>From e15bb967c1dde4ea0efc75d8da33be173589551b Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 16 May 2024 17:12:13 -0500
Subject: [PATCH 04/26] checkpoint commit -> able to create an inlined version
of the task that offloads
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 25 ++++-
clang/lib/CodeGen/CGStmt.cpp | 27 ++++++
clang/lib/CodeGen/CGStmtOpenMP.cpp | 2 +
clang/lib/Parse/ParseOpenMP.cpp | 14 +--
clang/lib/Sema/SemaOpenMP.cpp | 56 +++++------
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 6 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 93 +++++++++++++++++--
7 files changed, 178 insertions(+), 45 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 7b6e93e2122aa..f56c878b45df8 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1320,10 +1320,18 @@ llvm::Function *CGOpenMPRuntime::emitTaskOutlinedFunction(
HasCancel = TD->hasCancel();
CodeGenFunction CGF(CGM, true);
+ // llvm::errs() << "LLVMDEBUG::Before CGInfo\n";
+ // CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
CGOpenMPTaskOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen,
InnermostKind, HasCancel, Action);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
+ // llvm::errs() << "LLVMDEBUG::Before GenerateCapturedStmt\n";
+ // CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
llvm::Function *Res = CGF.GenerateCapturedStmtFunction(*CS);
+ llvm::errs() << "LLVMDEBUG::After GenerateCapturedStmt\n";
+ llvm::errs() << "LLVMDEBUG::CapturedStmt is \n";
+ CS->dump();
+ CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
if (!Tied)
NumberOfParts = Action.getNumberOfParts();
return Res;
@@ -3707,7 +3715,16 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
KmpTaskTWithPrivatesQTy, KmpTaskTQTy, SharedsPtrTy, TaskFunction,
TaskPrivatesMap);
- // Build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
+ llvm::errs() << "LLVMDEBUG::Proxy task function is \n";
+ TaskEntry->dump();
+ llvm::errs() << "LLVMDEBUG::CGF.Builder.GetInsertBlock() after emitting "
+ "proxy task function is \n";
+ CGF.Builder.GetInsertBlock()->dump();
+ llvm::errs() << "LLVMDEBUG::SharedsTy is \n";
+ CharUnits cu = C.getTypeSizeInChars(SharedsTy);
+ llvm::errs() << "LLVMDEBUG::sizeof(SharedsTy) = \n";
+ llvm::errs() << cu.getQuantity() << "\n";
+ // build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
// kmp_routine_entry_t *task_entry);
// Task flags. Format is taken from
@@ -9549,9 +9566,15 @@ static void emitTargetCallKernelLaunch(
emitOffloadingArrays(CGF, CombinedInfo, Info, OMPBuilder);
bool EmitDebug = CGF.CGM.getCodeGenOpts().getDebugInfo() !=
llvm::codegenoptions::NoDebugInfo;
+ llvm::errs() << "LLVMDEBUG::After emitOffloadingArrays in "
+ "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
+ OMPBuilder.Builder.GetInsertBlock()->dump();
OMPBuilder.emitOffloadingArraysArgument(CGF.Builder, Info.RTArgs, Info,
EmitDebug,
/*ForEndCall=*/false);
+ llvm::errs() << "LLVMDEBUG::After emitOffloadingArraysArgument in "
+ "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
+ OMPBuilder.Builder.GetInsertBlock()->dump();
InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
InputInfo.BasePointersArray = Address(Info.RTArgs.BasePointersArray,
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 99daaa14cf3fe..26baad23b87c5 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -3135,6 +3135,12 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
const RecordDecl *RD = S.getCapturedRecordDecl();
SourceLocation Loc = S.getBeginLoc();
assert(CD->hasBody() && "missing CapturedDecl body");
+ llvm::errs() << "LLVMDEBUG:: In GenerateCapturedStmtFunction\n";
+ if (Builder.GetInsertBlock()) {
+ llvm::errs()
+ << "LLVMDEBUG:: In GenerateCapturedStmtFunction, InsertBlock is \n";
+ Builder.GetInsertBlock()->dump();
+ }
// Build the argument list.
ASTContext &Ctx = CGM.getContext();
@@ -3156,6 +3162,13 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
// Generate the function.
StartFunction(CD, Ctx.VoidTy, F, FuncInfo, Args, CD->getLocation(),
CD->getBody()->getBeginLoc());
+ llvm::errs()
+ << "LLVMDEBUG:: In GenerateCapturedStmtFunction: After StartFunction\n";
+ if (Builder.GetInsertBlock()) {
+ llvm::errs()
+ << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
+ Builder.GetInsertBlock()->getParent()->dump();
+ }
// Set the context parameter in CapturedStmtInfo.
Address DeclPtr = GetAddrOfLocalVar(CD->getContextParam());
CapturedStmtInfo->setContextValue(Builder.CreateLoad(DeclPtr));
@@ -3181,7 +3194,21 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
}
PGO.assignRegionCounters(GlobalDecl(CD), F);
+ llvm::errs()
+ << "LLVMDEBUG:: In GenerateCapturedStmtFunction: Before EmitBody\n";
+ if (Builder.GetInsertBlock()) {
+ llvm::errs()
+ << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
+ Builder.GetInsertBlock()->getParent()->dump();
+ }
CapturedStmtInfo->EmitBody(*this, CD->getBody());
+ llvm::errs()
+ << "LLVMDEBUG:: In GenerateCapturedStmtFunction: After EmitBody\n";
+ if (Builder.GetInsertBlock()) {
+ llvm::errs()
+ << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
+ Builder.GetInsertBlock()->getParent()->dump();
+ }
FinishFunction(CD->getBodyRBrace());
return F;
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 200dd1878a449..1cd3e72c38cc0 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5187,6 +5187,8 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
llvm::Function *OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction(
S, *I, *PartId, *TaskT, S.getDirectiveKind(), CodeGen, /*Tied=*/true,
Data.NumberOfParts);
+ llvm::errs() << "LLVMDEBUG::Outlined Task Fn is \n";
+ OutlinedFn->dump();
llvm::APInt TrueOrFalse(32, S.hasClausesOfKind<OMPNowaitClause>() ? 1 : 0);
IntegerLiteral IfCond(getContext(), TrueOrFalse,
getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index ff6d68f616207..547dd8fcf4552 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2980,11 +2980,11 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
{
Sema::CompoundScopeRAII Scope(Actions);
AssociatedStmt = ParseStatement();
- Stmt *pdb_print = AssociatedStmt.get();
- if (pdb_print) {
- llvm::errs() << __FUNCTION__ << "Loc0:\n";
- pdb_print->dump();
- }
+ // Stmt *pdb_print = AssociatedStmt.get();
+ // if (pdb_print) {
+ // llvm::errs() << __FUNCTION__ << "Loc0:\n";
+ // pdb_print->dump();
+ // }
if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
getLangOpts().OpenMPIRBuilder)
AssociatedStmt =
@@ -2992,8 +2992,8 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
}
AssociatedStmt =
Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses);
- llvm::errs() << __FUNCTION__ << "Loc1:\n";
- AssociatedStmt.get()->dump();
+ // llvm::errs() << __FUNCTION__ << "Loc1:\n";
+ // AssociatedStmt.get()->dump();
} else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data ||
DKind == OMPD_target_exit_data) {
Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope());
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index f0a3ec40dee78..211b93a171dfe 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4854,19 +4854,19 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, DSAStack->getCurrentDirective());
- llvm::errs() << __FUNCTION__ << ": Loc0:\n";
- for (OpenMPDirectiveKind c : CaptureRegions) {
- switch (c) {
- case OMPD_task:
- llvm::errs() << "OMPD_task\n";
- break;
- case OMPD_target:
- llvm::errs() << "OMPD_target\n";
- break;
- default:
- llvm::errs() << "default\n";
- }
- }
+ // llvm::errs() << __FUNCTION__ << ": Loc0:\n";
+ // for (OpenMPDirectiveKind c : CaptureRegions) {
+ // switch (c) {
+ // case OMPD_task:
+ // llvm::errs() << "OMPD_task\n";
+ // break;
+ // case OMPD_target:
+ // llvm::errs() << "OMPD_target\n";
+ // break;
+ // default:
+ // llvm::errs() << "default\n";
+ // }
+ // }
OMPOrderedClause *OC = nullptr;
OMPScheduleClause *SC = nullptr;
SmallVector<const OMPLinearClause *, 4> LCs;
@@ -5018,11 +5018,11 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
}
if (++CompletedRegions == CaptureRegions.size())
DSAStack->setBodyComplete();
- llvm::errs() << __FUNCTION__ << ": Loc1:\n";
- SR.get()->dump();
+ // llvm::errs() << __FUNCTION__ << ": Loc1:\n";
+ // SR.get()->dump();
SR = SemaRef.ActOnCapturedRegionEnd(SR.get());
- llvm::errs() << __FUNCTION__ << ": Loc2:\n";
- SR.get()->dump();
+ // llvm::errs() << __FUNCTION__ << ": Loc2:\n";
+ // SR.get()->dump();
}
return SR;
}
@@ -6354,17 +6354,17 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
OpenMPBindClauseKind BindKind = OMPC_BIND_unknown;
llvm::SmallVector<OMPClause *> ClausesWithoutBind;
bool UseClausesWithoutBind = false;
- if (Kind == Directive::OMPD_target) {
- if (AStmt) {
- llvm::errs() << __FUNCTION__ << "***********************\n";
- AStmt->dump();
- llvm::errs() << __FUNCTION__ << "***PRETTY***\n";
- AStmt->dumpPretty(getASTContext());
- } else {
- llvm::errs() << "__FUNCTION__"
- << ": AStmt is nullptr\n";
- }
- }
+ // if (Kind == Directive::OMPD_target) {
+ // if (AStmt) {
+ // llvm::errs() << __FUNCTION__ << "***********************\n";
+ // AStmt->dump();
+ // llvm::errs() << __FUNCTION__ << "***PRETTY***\n";
+ // AStmt->dumpPretty(getASTContext());
+ // } else {
+ // llvm::errs() << "__FUNCTION__"
+ // << ": AStmt is nullptr\n";
+ // }
+ // }
if (const OMPBindClause *BC =
OMPExecutableDirective::getSingleClause<OMPBindClause>(Clauses))
BindKind = BC->getBindKind();
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index dd7605cbae6a6..10a85e72ae7dc 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1766,7 +1766,11 @@ class OpenMPIRBuilder {
const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
-
+ InsertPointTy emitTargetTask(IRBuilderBase &Builder, Function *OutlinedFn,
+ Value *OutlinedFnID,
+ EmitFallbackCallbackTy EmitTargetCallFallbackCB,
+ TargetKernelArgs &Args, Value *DeviceID,
+ Value *RTLoc);
/// Emit the arguments to be passed to the runtime library based on the
/// arrays of base pointers, pointers, sizes, map types, and mappers. If
/// ForEndCall, emit map types to be passed for the end of the region instead
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 02106c7316dca..e02794c255ca0 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1049,6 +1049,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
Builder.restoreIP(Loc.IP);
+ llvm::errs() << "LLVMDEBUG::KernelArgs.size() in emitTargetKernel = "
+ << KernelArgs.size() << "\n";
+
for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
llvm::Value *Arg =
Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
@@ -1757,7 +1760,9 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
assert(OutlinedFn.getNumUses() == 1 &&
"there must be a single user for the outlined function");
CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
-
+ llvm::errs() << "LLVMDEBUG::StaleCI is \n";
+ StaleCI->dump();
+ StaleCI->getParent()->getParent()->dump();
// HasShareds is true if any variables are captured in the outlined region,
// false otherwise.
bool HasShareds = StaleCI->arg_size() > 1;
@@ -5234,7 +5239,46 @@ static void emitTargetOutlinedFunction(
OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
OutlinedFn, OutlinedFnID);
}
-
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
+ IRBuilderBase &Builder, Function *OutlinedFn, Value *OutlinedFnID,
+ EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
+ Value *DeviceID, Value *RTLoc) {
+
+ // BasicBlock *TargetTaskExitBB = splitBB(Builder, /*CreateBranch=*/true,
+ // "target.task.exit");
+ BasicBlock *TargetTaskBodyBB =
+ splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
+ BasicBlock *TargetTaskAllocaBB =
+ splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
+
+ InsertPointTy TargetTaskAllocaIP =
+ InsertPointTy(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
+ InsertPointTy TargetTaskBodyIP =
+ InsertPointTy(TargetTaskBodyBB, TargetTaskBodyBB->begin());
+
+ {
+ // debug prints block
+ llvm::errs() << "Insert block before emitKernelLaunch in emittargettask\n";
+ Builder.GetInsertBlock()->dump();
+ llvm::errs()
+ << "LLVMDEBUG:: module before emitKernelLaunch in emittargettask is \n";
+ Builder.GetInsertBlock()->getParent()->getParent()->dump();
+ }
+ Builder.restoreIP(TargetTaskBodyIP);
+ Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
+ EmitTargetCallFallbackCB, Args, DeviceID,
+ RTLoc, TargetTaskAllocaIP));
+ {
+ // debug prints block
+ llvm::errs()
+ << "Insert block after emitKernelLaunch in emittargettask is \n";
+ Builder.GetInsertBlock()->dump();
+ llvm::errs()
+ << "LLVMDEBUG:: module after emitKernelLaunch in emittargettask is \n";
+ Builder.GetInsertBlock()->getParent()->getParent()->dump();
+ }
+ return Builder.saveIP();
+}
static void emitTargetCall(
OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
@@ -5288,14 +5332,44 @@ static void emitTargetCall(
// Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
// DeviceID, RTLoc, AllocaIP));
// else {
- // create task
- // make task call emitkernellaunch.
- // make task call
+ // codegen_callback = codegen callback to create task logic which should be
+ // received from openmptollvmirtranslation + emitkernellaunch
+ // create_task(codegen_callback)
+ // make task call
// }
//
- Builder.restoreIP(OMPBuilder.emitKernelLaunch(
- Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
- DeviceID, RTLoc, AllocaIP));
+ {
+ // Debug block
+ llvm::errs() << "Outlined Target Func is \n";
+ OutlinedFn->dump();
+ llvm::errs() << "CurrentInsertBlock is \n";
+ if (Builder.GetInsertBlock()) {
+ Builder.GetInsertBlock()->dump();
+ llvm::errs() << "Builder.GetInsertBlock = " << Builder.GetInsertBlock()
+ << "\n";
+ } else
+ llvm::errs() << "CurrentInsertBlock not set\n";
+
+ OpenMPIRBuilder::InsertPointTy IP = Builder.saveIP();
+ if (IP.getBlock() == nullptr) {
+ llvm::errs() << "InsertPoint block is null\n";
+ } else {
+ llvm::errs() << "IP.getBlock() = " << IP.getBlock() << "\n";
+ }
+ llvm::errs() << "AllocaIP = \n";
+ llvm::errs() << "Block:\n";
+ AllocaIP.getBlock()->dump();
+ llvm::errs() << "Point:\n";
+ AllocaIP.getPoint()->dump();
+ }
+ if (NewOMPIRBuilderTargetCodegen) {
+ OMPBuilder.emitTargetTask(Builder, OutlinedFn, OutlinedFnID,
+ EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc);
+ } else {
+ Builder.restoreIP(OMPBuilder.emitKernelLaunch(
+ Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
+ DeviceID, RTLoc, AllocaIP));
+ }
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::newCreateTarget(
const LocationDescription &Loc, InsertPointTy AllocaIP,
@@ -5615,6 +5689,9 @@ void OpenMPIRBuilder::emitOffloadingArrays(
return;
Builder.restoreIP(AllocaIP);
+ llvm::errs() << "LLVMDEBUG::Before emitOffloadingArrays in "
+ "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
+ Builder.GetInsertBlock()->dump();
// Detect if we have any capture size requiring runtime evaluation of the
// size so that a constant array could be eventually used.
ArrayType *PointerArrayType =
>From 53759cc4873bb5054d0f5d6ce9a12c80b6302cfe Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 20 May 2024 15:38:49 -0500
Subject: [PATCH 05/26] checkpoint commit
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 2 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 119 +++++++++++++++---
2 files changed, 101 insertions(+), 20 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 10a85e72ae7dc..70845b543e2fa 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1770,7 +1770,7 @@ class OpenMPIRBuilder {
Value *OutlinedFnID,
EmitFallbackCallbackTy EmitTargetCallFallbackCB,
TargetKernelArgs &Args, Value *DeviceID,
- Value *RTLoc);
+ Value *RTLoc, InsertPointTy AllocaIP);
/// Emit the arguments to be passed to the runtime library based on the
/// arrays of base pointers, pointers, sizes, map types, and mappers. If
/// ForEndCall, emit map types to be passed for the end of the region instead
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index e02794c255ca0..889caf8e40f8a 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -359,6 +359,41 @@ BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
+// This function creates a fake integer value and a fake use for the integer
+// value. It returns the fake value created. This is useful in modeling the
+// extra arguments to the outlined functions.
+Value *createFakeIntVal(IRBuilderBase &Builder,
+ OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
+ std::stack<Instruction *> &ToBeDeleted,
+ OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
+ const Twine &Name = "", bool AsPtr = true) {
+ Builder.restoreIP(OuterAllocaIP);
+ Instruction *FakeVal;
+ AllocaInst *FakeValAddr =
+ Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
+ ToBeDeleted.push(FakeValAddr);
+
+ if (AsPtr) {
+ FakeVal = FakeValAddr;
+ } else {
+ FakeVal =
+ Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
+ ToBeDeleted.push(FakeVal);
+ }
+
+ // Generate a fake use of this value
+ Builder.restoreIP(InnerAllocaIP);
+ Instruction *UseFakeVal;
+ if (AsPtr) {
+ UseFakeVal =
+ Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
+ } else {
+ UseFakeVal =
+ cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
+ }
+ ToBeDeleted.push(UseFakeVal);
+ return FakeVal;
+}
// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
@@ -1049,8 +1084,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
Builder.restoreIP(Loc.IP);
- llvm::errs() << "LLVMDEBUG::KernelArgs.size() in emitTargetKernel = "
- << KernelArgs.size() << "\n";
+ LLVM_DEBUG(dbgs() << "KernelArgs.size() in emitTargetKernel = "
+ << KernelArgs.size() << "\n");
for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
llvm::Value *Arg =
@@ -1760,9 +1795,13 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
assert(OutlinedFn.getNumUses() == 1 &&
"there must be a single user for the outlined function");
CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
- llvm::errs() << "LLVMDEBUG::StaleCI is \n";
- StaleCI->dump();
- StaleCI->getParent()->getParent()->dump();
+ LLVM_DEBUG(dbgs() << "StaleCI =" << *StaleCI << "\n");
+ LLVM_DEBUG(dbgs() << "StateCI->getParent()->getParent() = "
+ << *(StaleCI->getParent()->getParent()) << "\n");
+
+ // llvm::errs() << "LLVMDEBUG::StaleCI is \n";
+ // StaleCI->dump();
+ // StaleCI->getParent()->getParent()->dump();
// HasShareds is true if any variables are captured in the outlined region,
// false otherwise.
bool HasShareds = StaleCI->arg_size() > 1;
@@ -5222,6 +5261,14 @@ static Function *createOutlinedFunction(
return Func;
}
+// define internal i32 @.omp_task_entry..3(i32 noundef %0, ptr noalias noundef
+// %1) #3 {
+static void
+emitProxyTaskFunction(OpenMPIRBuilder::InsertPointTy ProxyFnCallSiteIP) {
+ // Create a function with the following signature
+ LLVMContext &Ctx = ProxyFnCallSiteIP.getBlock()->getContext();
+ Type *ThreadIDTy = Type::getInt32Ty(Ctx);
+}
static void emitTargetOutlinedFunction(
OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
@@ -5242,7 +5289,7 @@ static void emitTargetOutlinedFunction(
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
IRBuilderBase &Builder, Function *OutlinedFn, Value *OutlinedFnID,
EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
- Value *DeviceID, Value *RTLoc) {
+ Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP) {
// BasicBlock *TargetTaskExitBB = splitBB(Builder, /*CreateBranch=*/true,
// "target.task.exit");
@@ -5255,7 +5302,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
InsertPointTy(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
InsertPointTy TargetTaskBodyIP =
InsertPointTy(TargetTaskBodyBB, TargetTaskBodyBB->begin());
-
+#if 0
{
// debug prints block
llvm::errs() << "Insert block before emitKernelLaunch in emittargettask\n";
@@ -5264,19 +5311,51 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
<< "LLVMDEBUG:: module before emitKernelLaunch in emittargettask is \n";
Builder.GetInsertBlock()->getParent()->getParent()->dump();
}
+#endif
+ OutlineInfo OI;
+ OI.EntryBB = TargetTaskAllocaBB;
+ OI.OuterAllocaBB = AllocaIP.getBlock();
+
+ // Add the thread ID argument.
+ std::stack<Instruction *> ToBeDeleted;
+ OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
+ Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
Builder.restoreIP(TargetTaskBodyIP);
+
Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
EmitTargetCallFallbackCB, Args, DeviceID,
RTLoc, TargetTaskAllocaIP));
+ OI.ExitBB = Builder.saveIP().getBlock();
+ // OI.PostOutlineCB = [this,
+ // TargetTaskAllocaBB, ToBeDeleted](Function &OutlinedFn)
+ // mutable {
+
+ // assert(OutlinedFn.getNumUses() == 1 &&
+ // "there must be a single user for the outlined function");
+ // CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+ // llvm::errs() << "LLVMDEBUG::StaleCI in postline for targettask\n";
+ // StaleCI->dump();
+ // StaleCI->getParent()->getParent()->getParent()->dump();
+
+ // emitProxyTaskFunction(InsertPointTy(StaleCI->getParent(),
+ // StaleCI->getIterator()));
+
+ // // while (!ToBeDeleted.empty()) {
+ // // ToBeDeleted.top()->eraseFromParent();
+ // // ToBeDeleted.pop();
+ // // }
+ // };
+ addOutlineInfo(std::move(OI));
+#if 1
{
// debug prints block
- llvm::errs()
- << "Insert block after emitKernelLaunch in emittargettask is \n";
- Builder.GetInsertBlock()->dump();
- llvm::errs()
- << "LLVMDEBUG:: module after emitKernelLaunch in emittargettask is \n";
- Builder.GetInsertBlock()->getParent()->getParent()->dump();
+ LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
+ << *(Builder.GetInsertBlock()) << "\n");
+ LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
+ << *(Builder.GetInsertBlock()->getParent()->getParent())
+ << "\n");
}
+#endif
return Builder.saveIP();
}
static void emitTargetCall(
@@ -5338,6 +5417,7 @@ static void emitTargetCall(
// make task call
// }
//
+#if 0
{
// Debug block
llvm::errs() << "Outlined Target Func is \n";
@@ -5362,9 +5442,11 @@ static void emitTargetCall(
llvm::errs() << "Point:\n";
AllocaIP.getPoint()->dump();
}
+#endif
if (NewOMPIRBuilderTargetCodegen) {
OMPBuilder.emitTargetTask(Builder, OutlinedFn, OutlinedFnID,
- EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc);
+ EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,
+ AllocaIP);
} else {
Builder.restoreIP(OMPBuilder.emitKernelLaunch(
Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
@@ -5380,12 +5462,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::newCreateTarget(
OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
SmallVector<DependData> Dependencies) {
if (!NewOMPIRBuilderTargetCodegen) {
- llvm::errs() << "Old OpenMPIRBuilder target codegen\n";
+ LLVM_DEBUG(dbgs() << "Old OpenMPIRBuilder target codegen\n");
return createTarget(Loc, AllocaIP, CodeGenIP, EntryInfo, NumTeams,
NumThreads, Args, GenMapInfoCB, CBFunc,
ArgAccessorFuncCB);
}
- llvm::errs() << "New OpenMPIRBuilder target codegen\n";
+ LLVM_DEBUG(dbgs() << "New OpenMPIRBuilder target codegen\n");
if (!updateToLocation(Loc))
return InsertPointTy();
@@ -5689,9 +5771,8 @@ void OpenMPIRBuilder::emitOffloadingArrays(
return;
Builder.restoreIP(AllocaIP);
- llvm::errs() << "LLVMDEBUG::Before emitOffloadingArrays in "
- "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
- Builder.GetInsertBlock()->dump();
+ LLVM_DEBUG(dbgs() << "Basicblock before emitOffloadingArrays\n"
+ << *(Builder.GetInsertBlock()) << "\n");
// Detect if we have any capture size requiring runtime evaluation of the
// size so that a constant array could be eventually used.
ArrayType *PointerArrayType =
>From 83cea83f05ff1da0cc80d08f7635e52492c854ed Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 22 May 2024 00:55:12 -0500
Subject: [PATCH 06/26] checkpoint commit
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 9 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 212 +++++++++++++++---
llvm/lib/IR/Instruction.cpp | 5 +
3 files changed, 196 insertions(+), 30 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 70845b543e2fa..5c0eedbfc9a45 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -592,7 +592,11 @@ class OpenMPIRBuilder {
/// (filename, line, column, ...).
struct LocationDescription {
LocationDescription(const IRBuilderBase &IRB)
- : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
+ : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {
+ llvm::errs() << "In LocationDescription(const IRBuilderBase &), "
+ "IRB.GetInsertBlock() = "
+ << *IRB.GetInsertBlock() << "\n";
+ }
LocationDescription(const InsertPointTy &IP) : IP(IP) {}
LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
: IP(IP), DL(DL) {}
@@ -1766,8 +1770,7 @@ class OpenMPIRBuilder {
const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
- InsertPointTy emitTargetTask(IRBuilderBase &Builder, Function *OutlinedFn,
- Value *OutlinedFnID,
+ InsertPointTy emitTargetTask(Function *OutlinedFn, Value *OutlinedFnID,
EmitFallbackCallbackTy EmitTargetCallFallbackCB,
TargetKernelArgs &Args, Value *DeviceID,
Value *RTLoc, InsertPointTy AllocaIP);
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 889caf8e40f8a..eeccf9cf64656 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -609,6 +609,10 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
+ LLVM_DEBUG(dbgs() << "getOrCreateRuntimeFunction:Builder.GetInsertBlock() = "
+ << *Builder.GetInsertBlock() << "\n");
+ LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = " << Builder.GetInsertBlock()
+ << "\n");
FunctionType *FnTy = nullptr;
Function *Fn = nullptr;
@@ -655,6 +659,10 @@ OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
addAttributes(FnID, *Fn);
} else {
+ LLVM_DEBUG(dbgs() << "{else}Builder.GetInsertBlock() = "
+ << *Builder.GetInsertBlock() << "\n");
+ LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
+ << Builder.GetInsertBlock() << "\n");
LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
<< " with type " << *Fn->getFunctionType() << "\n");
}
@@ -958,6 +966,11 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
}
Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
+ LLVM_DEBUG(dbgs() << "&Builder = " << &Builder << "\n");
+ LLVM_DEBUG(dbgs() << "getORCreateThreadID:Builder.GetInsertBlock() = "
+ << *Builder.GetInsertBlock() << "\n");
+ LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = " << Builder.GetInsertBlock()
+ << "\n");
return Builder.CreateCall(
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
"omp_global_thread_num");
@@ -1799,9 +1812,6 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
LLVM_DEBUG(dbgs() << "StateCI->getParent()->getParent() = "
<< *(StaleCI->getParent()->getParent()) << "\n");
- // llvm::errs() << "LLVMDEBUG::StaleCI is \n";
- // StaleCI->dump();
- // StaleCI->getParent()->getParent()->dump();
// HasShareds is true if any variables are captured in the outlined region,
// false otherwise.
bool HasShareds = StaleCI->arg_size() > 1;
@@ -5263,11 +5273,71 @@ static Function *createOutlinedFunction(
// define internal i32 @.omp_task_entry..3(i32 noundef %0, ptr noalias noundef
// %1) #3 {
-static void
-emitProxyTaskFunction(OpenMPIRBuilder::InsertPointTy ProxyFnCallSiteIP) {
+static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
+ IRBuilderBase &Builder,
+ CallInst *StaleCI) {
// Create a function with the following signature
- LLVMContext &Ctx = ProxyFnCallSiteIP.getBlock()->getContext();
+ // define internal i32 @.omp_task_entry..3(i32 noundef %0, ptr noalias noundef
+ // %1) #3 {
+ Module &M = OMPBuilder.M;
+ Function *CalledFunction = StaleCI->getCalledFunction();
+ OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
+ StaleCI->getIterator());
+ LLVMContext &Ctx = StaleCI->getParent()->getContext();
Type *ThreadIDTy = Type::getInt32Ty(Ctx);
+ Type *TaskPtrTy = OMPBuilder.TaskPtr;
+ Type *TaskTy = OMPBuilder.Task;
+ auto ProxyFnTy =
+ FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
+ /* isVarArg */ false);
+ auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
+ ".omp_target_task_proxy_func",
+ Builder.GetInsertBlock()->getModule());
+ auto OldInsertPoint = Builder.saveIP();
+
+ BasicBlock *EntryBB =
+ BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
+ Builder.SetInsertPoint(EntryBB);
+
+ bool HasShareds = StaleCI->arg_size() > 1;
+ // PDB: Temporary assert.
+ assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
+ "StaleCI with shareds should have exactly two arguments.");
+ if (HasShareds) {
+ AllocaInst *ArgStructAlloca =
+ dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+ assert(ArgStructAlloca &&
+ "Unable to find the alloca instruction corresponding to arguments "
+ "for extracted function");
+ StructType *ArgStructType =
+ dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
+ LLVM_DEBUG(dbgs() << "ArgStructType = " << *ArgStructType << "\n");
+
+ AllocaInst *NewArgStructAlloca =
+ Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
+ Value *TaskT = ProxyFn->getArg(1);
+ Value *ThreadId = ProxyFn->getArg(0);
+ LLVM_DEBUG(dbgs() << "TaskT = " << *TaskT << "\n");
+ Value *SharedsSize =
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
+
+ Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
+ LoadInst *LoadShared =
+ Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
+
+ // TODO: Are these alignment values correct?
+ Builder.CreateMemCpy(
+ NewArgStructAlloca,
+ NewArgStructAlloca->getPointerAlignment(M.getDataLayout()), Shareds,
+ LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
+
+ Builder.CreateCall(CalledFunction, {ThreadId, NewArgStructAlloca});
+ }
+ CalledFunction->removeFnAttr(llvm::Attribute::NoInline);
+ CalledFunction->addFnAttr(llvm::Attribute::AlwaysInline);
+ Builder.CreateRetVoid();
+ Builder.restoreIP(OldInsertPoint);
+ return ProxyFn;
}
static void emitTargetOutlinedFunction(
OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
@@ -5287,10 +5357,12 @@ static void emitTargetOutlinedFunction(
OutlinedFn, OutlinedFnID);
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
- IRBuilderBase &Builder, Function *OutlinedFn, Value *OutlinedFnID,
+ Function *OutlinedFn, Value *OutlinedFnID,
EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP) {
+ LLVM_DEBUG(dbgs() << "emitTargetTask:OMPBuilder.Builder = " << &this->Builder
+ << ", Builder = " << &Builder << "\n");
// BasicBlock *TargetTaskExitBB = splitBB(Builder, /*CreateBranch=*/true,
// "target.task.exit");
BasicBlock *TargetTaskBodyBB =
@@ -5322,29 +5394,111 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
Builder.restoreIP(TargetTaskBodyIP);
+ // emitKernelLaunch makes the necessary runtime call to offload the kernel.
+ // We then outline all that code into a separate function that is called
+ // by the task wrapper function (aka Proxy task function - see
+ // emitProxyTaskFunction)
Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
EmitTargetCallFallbackCB, Args, DeviceID,
RTLoc, TargetTaskAllocaIP));
OI.ExitBB = Builder.saveIP().getBlock();
- // OI.PostOutlineCB = [this,
- // TargetTaskAllocaBB, ToBeDeleted](Function &OutlinedFn)
- // mutable {
-
- // assert(OutlinedFn.getNumUses() == 1 &&
- // "there must be a single user for the outlined function");
- // CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
- // llvm::errs() << "LLVMDEBUG::StaleCI in postline for targettask\n";
- // StaleCI->dump();
- // StaleCI->getParent()->getParent()->getParent()->dump();
-
- // emitProxyTaskFunction(InsertPointTy(StaleCI->getParent(),
- // StaleCI->getIterator()));
-
- // // while (!ToBeDeleted.empty()) {
- // // ToBeDeleted.top()->eraseFromParent();
- // // ToBeDeleted.pop();
- // // }
- // };
+ OI.PostOutlineCB = [this, ToBeDeleted](Function &OutlinedFn) mutable {
+ assert(OutlinedFn.getNumUses() == 1 &&
+ "there must be a single user for the outlined function");
+ CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+ bool HasShareds = StaleCI->arg_size() > 1;
+
+ LLVM_DEBUG(dbgs() << "StaleCI in PostOutlineCB in emitTargetTask = "
+ << *StaleCI << "\n");
+ LLVM_DEBUG(dbgs() << "Module in PostOutlineCB in emitTargetTask = "
+ << *(StaleCI->getParent()->getParent()->getParent())
+ << "\n");
+
+ Function *ProxyFn = emitProxyTaskFunction(*this, Builder, StaleCI);
+ LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
+ << "\n");
+
+ Builder.SetInsertPoint(StaleCI);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr =
+ getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+ // Gather the arguments for emitting the runtime call for
+ // @__kmpc_omp_task_alloc
+ Function *TaskAllocFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
+
+ // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
+ // call.
+ LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
+ << *(Builder.GetInsertBlock()) << "\n");
+ LLVM_DEBUG(dbgs() << "Builder.GetInsertPoint() = "
+ << *(Builder.GetInsertPoint()) << "\n");
+ LLVM_DEBUG(dbgs() << "Builder.GetInsertPoint()->getParent() = "
+ << Builder.GetInsertPoint()->getParent() << "\n");
+ LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
+ << Builder.GetInsertBlock() << "\n");
+ LLVM_DEBUG(dbgs() << "In the Callback: OMPBuilder.Builder = "
+ << &this->Builder << ", Builder = " << &Builder << "\n");
+ LLVM_DEBUG(dbgs() << "&Builder = " << &Builder << "\n");
+ Value *ThreadID = getOrCreateThreadID(Ident);
+
+ // TODO : Task tied or not? See what clang does.
+
+ // Argument - `sizeof_kmp_task_t` (TaskSize)
+ // Tasksize refers to the size in bytes of kmp_task_t data structure
+ // including private vars accessed in task.
+ // TODO: add kmp_task_t_with_privates (privates)
+ Value *TaskSize = Builder.getInt64(
+ divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
+
+ // Argument - `sizeof_shareds` (SharedsSize)
+ // SharedsSize refers to the shareds array size in the kmp_task_t data
+ // structure.
+ Value *SharedsSize = Builder.getInt64(0);
+ if (HasShareds) {
+ AllocaInst *ArgStructAlloca =
+ dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+ assert(ArgStructAlloca &&
+ "Unable to find the alloca instruction corresponding to arguments "
+ "for extracted function");
+ StructType *ArgStructType =
+ dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
+ assert(ArgStructType && "Unable to find struct type corresponding to "
+ "arguments for extracted function");
+ SharedsSize =
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
+ }
+
+ // Argument - `flags`
+ // Task is tied iff (Flags & 1) == 1.
+ // Task is untied iff (Flags & 1) == 0.
+ // Task is final iff (Flags & 2) == 2.
+ // Task is not final iff (Flags & 2) == 0.
+ // A target task is not final and is untied.
+ Value *Flags = Builder.getInt32(0);
+
+ // Emit the @__kmpc_omp_task_alloc runtime call
+ // The runtime call returns a pointer to an area where the task captured
+ // variables must be copied before the task is run (TaskData)
+ CallInst *TaskData = Builder.CreateCall(
+ TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
+ /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
+ /*task_func=*/ProxyFn});
+
+ if (HasShareds) {
+ Value *Shareds = StaleCI->getArgOperand(1);
+ Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
+ Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
+ Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
+ SharedsSize);
+ }
+
+ // while (!ToBeDeleted.empty()) {
+ // ToBeDeleted.top()->eraseFromParent();
+ // ToBeDeleted.pop();
+ // }
+ };
addOutlineInfo(std::move(OI));
#if 1
{
@@ -5374,6 +5528,8 @@ static void emitTargetCall(
OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
/*IsNonContiguous=*/true);
+ LLVM_DEBUG(dbgs() << "OMPBuilder.Builder = " << &OMPBuilder.Builder
+ << ", Builder = " << &Builder << "\n");
OpenMPIRBuilder::TargetDataRTArgs RTArgs;
OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
!MapInfo.Names.empty());
@@ -5381,6 +5537,8 @@ static void emitTargetCall(
// emitKernelLaunch
auto &&EmitTargetCallFallbackCB =
[&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
+ LLVM_DEBUG(dbgs() << "EmitTargetCallFallbackCB::Builder = " << &Builder
+ << "\n");
Builder.restoreIP(IP);
Builder.CreateCall(OutlinedFn, Args);
return Builder.saveIP();
@@ -5444,7 +5602,7 @@ static void emitTargetCall(
}
#endif
if (NewOMPIRBuilderTargetCodegen) {
- OMPBuilder.emitTargetTask(Builder, OutlinedFn, OutlinedFnID,
+ OMPBuilder.emitTargetTask(OutlinedFn, OutlinedFnID,
EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,
AllocaIP);
} else {
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 29272e627a1d1..22f5b5a41fc3f 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -128,6 +128,11 @@ void Instruction::insertAfter(Instruction *InsertPos) {
BasicBlock::iterator Instruction::insertInto(BasicBlock *ParentBB,
BasicBlock::iterator It) {
assert(getParent() == nullptr && "Expected detached instruction");
+ if (!(It == ParentBB->end() || It->getParent() == ParentBB)) {
+ llvm::errs() << "ParentBB = " << *ParentBB << "\n";
+ llvm::errs() << "It = " << *It << "\n";
+ llvm::errs() << "It->getParent() = " << *It->getParent() << "\n";
+ }
assert((It == ParentBB->end() || It->getParent() == ParentBB) &&
"It not in ParentBB");
insertBefore(*ParentBB, It);
>From d5c2449079d292de324d9a72ddd0c94d38a1b1f8 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 24 May 2024 01:30:58 -0500
Subject: [PATCH 07/26] Simple test working. Checkpoint commit. Next steps:
clean up code, unit tests(?) and lit tests - basically more testing before PR
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 22 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 292 ++++++++++--------
llvm/lib/IR/BasicBlock.cpp | 6 +-
llvm/lib/IR/Function.cpp | 16 +-
llvm/lib/IR/Module.cpp | 4 +-
mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp | 3 +-
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 2 -
7 files changed, 191 insertions(+), 154 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5c0eedbfc9a45..8a67cd4b8d9f0 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -592,11 +592,7 @@ class OpenMPIRBuilder {
/// (filename, line, column, ...).
struct LocationDescription {
LocationDescription(const IRBuilderBase &IRB)
- : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {
- llvm::errs() << "In LocationDescription(const IRBuilderBase &), "
- "IRB.GetInsertBlock() = "
- << *IRB.GetInsertBlock() << "\n";
- }
+ : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
LocationDescription(const InsertPointTy &IP) : IP(IP) {}
LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
: IP(IP), DL(DL) {}
@@ -1522,12 +1518,7 @@ class OpenMPIRBuilder {
std::forward_list<CanonicalLoopInfo> LoopInfos;
/// Add a new region that will be outlined later.
- void addOutlineInfo(OutlineInfo &&OI) {
- llvm::errs() << "Adding outline info\n";
- llvm::errs() << "OI.EntryBB = ";
- OI.EntryBB->dump();
- OutlineInfos.emplace_back(OI);
- }
+ void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
/// An ordered map of auto-generated variables to their unique names.
/// It stores variables with the following names: 1) ".gomp_critical_user_" +
@@ -1770,10 +1761,11 @@ class OpenMPIRBuilder {
const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
- InsertPointTy emitTargetTask(Function *OutlinedFn, Value *OutlinedFnID,
- EmitFallbackCallbackTy EmitTargetCallFallbackCB,
- TargetKernelArgs &Args, Value *DeviceID,
- Value *RTLoc, InsertPointTy AllocaIP);
+ InsertPointTy emitTargetTask(
+ Function *OutlinedFn, Value *OutlinedFnID,
+ EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
+ Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP,
+ SmallVector<OpenMPIRBuilder::DependData> &Dependencies, bool HasNoWait);
/// Emit the arguments to be passed to the runtime library based on the
/// arrays of base pointers, pointers, sizes, map types, and mappers. If
/// ForEndCall, emit map types to be passed for the end of the region instead
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index eeccf9cf64656..be0717898ff25 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -394,41 +394,42 @@ Value *createFakeIntVal(IRBuilderBase &Builder,
ToBeDeleted.push(UseFakeVal);
return FakeVal;
}
-// This function creates a fake integer value and a fake use for the integer
-// value. It returns the fake value created. This is useful in modeling the
-// extra arguments to the outlined functions.
-Value *createFakeIntVal(IRBuilder<> &Builder,
- OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
- std::stack<Instruction *> &ToBeDeleted,
- OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
- const Twine &Name = "", bool AsPtr = true) {
- Builder.restoreIP(OuterAllocaIP);
- Instruction *FakeVal;
- AllocaInst *FakeValAddr =
- Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
- ToBeDeleted.push(FakeValAddr);
-
- if (AsPtr) {
- FakeVal = FakeValAddr;
- } else {
- FakeVal =
- Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
- ToBeDeleted.push(FakeVal);
- }
-
- // Generate a fake use of this value
- Builder.restoreIP(InnerAllocaIP);
- Instruction *UseFakeVal;
- if (AsPtr) {
- UseFakeVal =
- Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
- } else {
- UseFakeVal =
- cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
- }
- ToBeDeleted.push(UseFakeVal);
- return FakeVal;
-}
+// // This function creates a fake integer value and a fake use for the integer
+// // value. It returns the fake value created. This is useful in modeling the
+// // extra arguments to the outlined functions.
+// Value *createFakeIntVal(IRBuilder<> &Builder,
+// OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
+// std::stack<Instruction *> &ToBeDeleted,
+// OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
+// const Twine &Name = "", bool AsPtr = true) {
+// Builder.restoreIP(OuterAllocaIP);
+// Instruction *FakeVal;
+// AllocaInst *FakeValAddr =
+// Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
+// ToBeDeleted.push(FakeValAddr);
+
+// if (AsPtr) {
+// FakeVal = FakeValAddr;
+// } else {
+// FakeVal =
+// Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
+// ToBeDeleted.push(FakeVal);
+// }
+
+// // Generate a fake use of this value
+// Builder.restoreIP(InnerAllocaIP);
+// Instruction *UseFakeVal;
+// if (AsPtr) {
+// UseFakeVal =
+// Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
+// } else {
+// UseFakeVal =
+// cast<BinaryOperator>(Builder.CreateAdd(FakeVal,
+// Builder.getInt32(10)));
+// }
+// ToBeDeleted.push(UseFakeVal);
+// return FakeVal;
+// }
//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
@@ -609,10 +610,6 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
- LLVM_DEBUG(dbgs() << "getOrCreateRuntimeFunction:Builder.GetInsertBlock() = "
- << *Builder.GetInsertBlock() << "\n");
- LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = " << Builder.GetInsertBlock()
- << "\n");
FunctionType *FnTy = nullptr;
Function *Fn = nullptr;
@@ -659,10 +656,6 @@ OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
addAttributes(FnID, *Fn);
} else {
- LLVM_DEBUG(dbgs() << "{else}Builder.GetInsertBlock() = "
- << *Builder.GetInsertBlock() << "\n");
- LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
- << Builder.GetInsertBlock() << "\n");
LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
<< " with type " << *Fn->getFunctionType() << "\n");
}
@@ -966,11 +959,6 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
}
Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
- LLVM_DEBUG(dbgs() << "&Builder = " << &Builder << "\n");
- LLVM_DEBUG(dbgs() << "getORCreateThreadID:Builder.GetInsertBlock() = "
- << *Builder.GetInsertBlock() << "\n");
- LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = " << Builder.GetInsertBlock()
- << "\n");
return Builder.CreateCall(
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
"omp_global_thread_num");
@@ -1097,9 +1085,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
Builder.restoreIP(Loc.IP);
- LLVM_DEBUG(dbgs() << "KernelArgs.size() in emitTargetKernel = "
- << KernelArgs.size() << "\n");
-
for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
llvm::Value *Arg =
Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
@@ -1753,6 +1738,54 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
return;
emitTaskyieldImpl(Loc);
}
+static Value *
+emitDepArray(OpenMPIRBuilder &OMPBuilder,
+ SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
+ IRBuilderBase &Builder = OMPBuilder.Builder;
+ Type *DependInfo = OMPBuilder.DependInfo;
+ Module &M = OMPBuilder.M;
+
+ Value *DepArray = nullptr;
+ if (Dependencies.size()) {
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+ Builder.SetInsertPoint(
+ &OldIP.getBlock()->getParent()->getEntryBlock().back());
+
+ Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
+ DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
+
+ unsigned P = 0;
+ for (const OpenMPIRBuilder::DependData &Dep : Dependencies) {
+ Value *Base =
+ Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
+ // Store the pointer to the variable
+ Value *Addr = Builder.CreateStructGEP(
+ DependInfo, Base,
+ static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
+ Value *DepValPtr =
+ Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
+ Builder.CreateStore(DepValPtr, Addr);
+ // Store the size of the variable
+ Value *Size = Builder.CreateStructGEP(
+ DependInfo, Base,
+ static_cast<unsigned int>(RTLDependInfoFields::Len));
+ Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
+ Dep.DepValueType)),
+ Size);
+ // Store the dependency kind
+ Value *Flags = Builder.CreateStructGEP(
+ DependInfo, Base,
+ static_cast<unsigned int>(RTLDependInfoFields::Flags));
+ Builder.CreateStore(
+ ConstantInt::get(Builder.getInt8Ty(),
+ static_cast<unsigned int>(Dep.DepKind)),
+ Flags);
+ ++P;
+ }
+ Builder.restoreIP(OldIP);
+ }
+ return DepArray;
+}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTask(const LocationDescription &Loc,
@@ -1808,9 +1841,6 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
assert(OutlinedFn.getNumUses() == 1 &&
"there must be a single user for the outlined function");
CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
- LLVM_DEBUG(dbgs() << "StaleCI =" << *StaleCI << "\n");
- LLVM_DEBUG(dbgs() << "StateCI->getParent()->getParent() = "
- << *(StaleCI->getParent()->getParent()) << "\n");
// HasShareds is true if any variables are captured in the outlined region,
// false otherwise.
@@ -5293,7 +5323,7 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
".omp_target_task_proxy_func",
Builder.GetInsertBlock()->getModule());
- auto OldInsertPoint = Builder.saveIP();
+ // auto OldInsertPoint = Builder.saveIP();
BasicBlock *EntryBB =
BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
@@ -5328,15 +5358,17 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
// TODO: Are these alignment values correct?
Builder.CreateMemCpy(
NewArgStructAlloca,
- NewArgStructAlloca->getPointerAlignment(M.getDataLayout()), Shareds,
+ NewArgStructAlloca->getPointerAlignment(M.getDataLayout()), LoadShared,
LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
Builder.CreateCall(CalledFunction, {ThreadId, NewArgStructAlloca});
}
- CalledFunction->removeFnAttr(llvm::Attribute::NoInline);
- CalledFunction->addFnAttr(llvm::Attribute::AlwaysInline);
+ // CalledFunction->removeFnAttr(llvm::Attribute::NoInline);
+ // CalledFunction->addFnAttr(llvm::Attribute::AlwaysInline);
+ ProxyFn->getArg(0)->setName("thread.id");
+ ProxyFn->getArg(1)->setName("task");
Builder.CreateRetVoid();
- Builder.restoreIP(OldInsertPoint);
+ // Builder.restoreIP(OldInsertPoint);
return ProxyFn;
}
static void emitTargetOutlinedFunction(
@@ -5359,12 +5391,13 @@ static void emitTargetOutlinedFunction(
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
Function *OutlinedFn, Value *OutlinedFnID,
EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
- Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP) {
+ Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP,
+ SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
+ bool HasNoWait) {
LLVM_DEBUG(dbgs() << "emitTargetTask:OMPBuilder.Builder = " << &this->Builder
<< ", Builder = " << &Builder << "\n");
- // BasicBlock *TargetTaskExitBB = splitBB(Builder, /*CreateBranch=*/true,
- // "target.task.exit");
+
BasicBlock *TargetTaskBodyBB =
splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
BasicBlock *TargetTaskAllocaBB =
@@ -5374,16 +5407,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
InsertPointTy(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
InsertPointTy TargetTaskBodyIP =
InsertPointTy(TargetTaskBodyBB, TargetTaskBodyBB->begin());
-#if 0
- {
- // debug prints block
- llvm::errs() << "Insert block before emitKernelLaunch in emittargettask\n";
- Builder.GetInsertBlock()->dump();
- llvm::errs()
- << "LLVMDEBUG:: module before emitKernelLaunch in emittargettask is \n";
- Builder.GetInsertBlock()->getParent()->getParent()->dump();
- }
-#endif
+
OutlineInfo OI;
OI.EntryBB = TargetTaskAllocaBB;
OI.OuterAllocaBB = AllocaIP.getBlock();
@@ -5392,6 +5416,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
std::stack<Instruction *> ToBeDeleted;
OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
+
Builder.restoreIP(TargetTaskBodyIP);
// emitKernelLaunch makes the necessary runtime call to offload the kernel.
@@ -5402,9 +5427,11 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
EmitTargetCallFallbackCB, Args, DeviceID,
RTLoc, TargetTaskAllocaIP));
OI.ExitBB = Builder.saveIP().getBlock();
- OI.PostOutlineCB = [this, ToBeDeleted](Function &OutlinedFn) mutable {
+ OI.PostOutlineCB = [this, ToBeDeleted, Dependencies,
+ HasNoWait](Function &OutlinedFn) mutable {
assert(OutlinedFn.getNumUses() == 1 &&
"there must be a single user for the outlined function");
+
CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
bool HasShareds = StaleCI->arg_size() > 1;
@@ -5419,32 +5446,21 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
<< "\n");
Builder.SetInsertPoint(StaleCI);
+
+ // Gather the arguments for the @__kmpc_omp_task_alloc runtime call.
uint32_t SrcLocStrSize;
Constant *SrcLocStr =
getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
- // Gather the arguments for emitting the runtime call for
+
// @__kmpc_omp_task_alloc
Function *TaskAllocFn =
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
// Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
// call.
- LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
- << *(Builder.GetInsertBlock()) << "\n");
- LLVM_DEBUG(dbgs() << "Builder.GetInsertPoint() = "
- << *(Builder.GetInsertPoint()) << "\n");
- LLVM_DEBUG(dbgs() << "Builder.GetInsertPoint()->getParent() = "
- << Builder.GetInsertPoint()->getParent() << "\n");
- LLVM_DEBUG(dbgs() << "Builder.GetInsertBlock() = "
- << Builder.GetInsertBlock() << "\n");
- LLVM_DEBUG(dbgs() << "In the Callback: OMPBuilder.Builder = "
- << &this->Builder << ", Builder = " << &Builder << "\n");
- LLVM_DEBUG(dbgs() << "&Builder = " << &Builder << "\n");
Value *ThreadID = getOrCreateThreadID(Ident);
- // TODO : Task tied or not? See what clang does.
-
// Argument - `sizeof_kmp_task_t` (TaskSize)
// Tasksize refers to the size in bytes of kmp_task_t data structure
// including private vars accessed in task.
@@ -5493,23 +5509,65 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
SharedsSize);
}
+ if (Dependencies.size()) {
+ Value *DepArray = emitDepArray(*this, Dependencies);
+ Function *TaskWaitFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
+ Builder.CreateCall(
+ TaskWaitFn,
+ {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
+ ConstantInt::get(Builder.getInt32Ty(), 0),
+ ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
+ }
- // while (!ToBeDeleted.empty()) {
- // ToBeDeleted.top()->eraseFromParent();
- // ToBeDeleted.pop();
+ // ---------------------------------------------------------------
+ // V5.2 13.8 target construct
+ // If the nowait clause is present, execution of the target task
+ // may be deferred. If the nowait clause is not present, the target task is
+ // an included task.
+ // ---------------------------------------------------------------
+ // The above means that the lack of a nowait on the target construct
+ // translates to '#pragma omp task if(0)'
+ if (!HasNoWait) {
+ // Included task.
+ Function *TaskBeginFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
+ Function *TaskCompleteFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
+ Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
+ CallInst *CI = nullptr;
+ if (HasShareds)
+ CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
+ else
+ CI = Builder.CreateCall(ProxyFn, {ThreadID});
+ CI->setDebugLoc(StaleCI->getDebugLoc());
+ Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
+ } else {
+ // Emit the @__kmpc_omp_task runtime call to spawn the task
+ Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
+ Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
+ }
+
+ StaleCI->eraseFromParent();
+ // Builder.SetInsertPoint(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
+ // if (HasShareds) {
+ // LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
+ // OutlinedFn.getArg(1)->replaceUsesWithIf(
+ // Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
// }
+
+ while (!ToBeDeleted.empty()) {
+ ToBeDeleted.top()->eraseFromParent();
+ ToBeDeleted.pop();
+ }
};
addOutlineInfo(std::move(OI));
-#if 1
- {
- // debug prints block
- LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
- << *(Builder.GetInsertBlock()) << "\n");
- LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
- << *(Builder.GetInsertBlock()->getParent()->getParent())
- << "\n");
- }
-#endif
+
+ LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
+ << *(Builder.GetInsertBlock()) << "\n");
+ LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
+ << *(Builder.GetInsertBlock()->getParent()->getParent())
+ << "\n");
return Builder.saveIP();
}
static void emitTargetCall(
@@ -5518,7 +5576,7 @@ static void emitTargetCall(
Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads,
SmallVectorImpl<Value *> &Args,
OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
- SmallVector<llvm::OpenMPIRBuilder::DependData> dependencies = {}) {
+ SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {}) {
OpenMPIRBuilder::TargetDataInfo Info(
/*RequiresDevicePointerInfo=*/false,
@@ -5559,6 +5617,8 @@ static void emitTargetCall(
Value *DynCGGroupMem = Builder.getInt32(0);
bool HasNoWait = false;
+ bool HasDependencies = Dependencies.size() > 0;
+ bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
NumTeamsVal, NumThreadsVal,
@@ -5575,36 +5635,10 @@ static void emitTargetCall(
// make task call
// }
//
-#if 0
- {
- // Debug block
- llvm::errs() << "Outlined Target Func is \n";
- OutlinedFn->dump();
- llvm::errs() << "CurrentInsertBlock is \n";
- if (Builder.GetInsertBlock()) {
- Builder.GetInsertBlock()->dump();
- llvm::errs() << "Builder.GetInsertBlock = " << Builder.GetInsertBlock()
- << "\n";
- } else
- llvm::errs() << "CurrentInsertBlock not set\n";
-
- OpenMPIRBuilder::InsertPointTy IP = Builder.saveIP();
- if (IP.getBlock() == nullptr) {
- llvm::errs() << "InsertPoint block is null\n";
- } else {
- llvm::errs() << "IP.getBlock() = " << IP.getBlock() << "\n";
- }
- llvm::errs() << "AllocaIP = \n";
- llvm::errs() << "Block:\n";
- AllocaIP.getBlock()->dump();
- llvm::errs() << "Point:\n";
- AllocaIP.getPoint()->dump();
- }
-#endif
- if (NewOMPIRBuilderTargetCodegen) {
+ if (NewOMPIRBuilderTargetCodegen && RequiresOuterTargetTask) {
OMPBuilder.emitTargetTask(OutlinedFn, OutlinedFnID,
EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,
- AllocaIP);
+ AllocaIP, Dependencies, HasNoWait);
} else {
Builder.restoreIP(OMPBuilder.emitKernelLaunch(
Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 29f2cbf611fa3..205065aef6488 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -445,8 +445,12 @@ BasicBlock::const_iterator BasicBlock::getFirstNonPHIOrDbgOrAlloca() const {
}
void BasicBlock::dropAllReferences() {
- for (Instruction &I : *this)
+ // bool debug_on = (this->getName() == "target.task.alloca");
+ for (Instruction &I : *this) {
+ LLVM_DEBUG(dbgs() << "Dropping all references in I = " << I << "\n");
I.dropAllReferences();
+ LLVM_DEBUG(dbgs() << "After Dropping all references in I = " << I << "\n");
+ }
}
const BasicBlock *BasicBlock::getSinglePredecessor() const {
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 13fa1afeaaff2..bb68755bea733 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -64,6 +64,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ModRef.h"
#include <cassert>
@@ -71,10 +72,10 @@
#include <cstdint>
#include <cstring>
#include <string>
+#define DEBUG_TYPE "pranav"
using namespace llvm;
using ProfileCount = Function::ProfileCount;
-
// Explicit instantiations of SymbolTableListTraits since some of the methods
// are not in the public header file...
template class llvm::SymbolTableListTraits<BasicBlock>;
@@ -550,10 +551,15 @@ void Function::stealArgumentListFrom(Function &Src) {
void Function::deleteBodyImpl(bool ShouldDrop) {
setIsMaterializable(false);
-
- for (BasicBlock &BB : *this)
+ bool OldDebugFlag = DebugFlag;
+ if (this->getName() == "_QQmain..omp_par.1") {
+ DebugFlag = true;
+ }
+ for (BasicBlock &BB : *this) {
+ LLVM_DEBUG(dbgs() << "Dropping all references in " << BB << "\n");
BB.dropAllReferences();
-
+ LLVM_DEBUG(dbgs() << "After Dropping all references in " << BB << "\n");
+ }
// Delete all basic blocks. They are now unused, except possibly by
// blockaddresses, but BasicBlock's destructor takes care of those.
while (!BasicBlocks.empty())
@@ -573,7 +579,7 @@ void Function::deleteBodyImpl(bool ShouldDrop) {
}
setValueSubclassData(getSubclassDataFromValue() & ~0xe);
}
-
+ DebugFlag = OldDebugFlag;
// Metadata is stored in a side-table.
clearMetadata();
}
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index f97dd18c736c5..4d986ded06f11 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -539,8 +539,10 @@ std::string Module::getUniqueIntrinsicName(StringRef BaseName, Intrinsic::ID Id,
// has "dropped all references", except operator delete.
//
void Module::dropAllReferences() {
- for (Function &F : *this)
+ for (Function &F : *this) {
+ // llvm::errs() << "Dropping all references in " << F.getName() << "\n";
F.dropAllReferences();
+ }
for (GlobalVariable &GV : globals())
GV.dropAllReferences();
diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
index 4558893779534..c0c03df7cbc5d 100644
--- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
@@ -18,6 +18,7 @@
#include "mlir/Tools/mlir-translate/Translation.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
using namespace mlir;
@@ -30,7 +31,7 @@ void registerToLLVMIRTranslation() {
auto llvmModule = translateModuleToLLVMIR(op, llvmContext);
if (!llvmModule)
return failure();
-
+ llvm::verifyModule(*llvmModule);
llvmModule->print(output, nullptr);
return success();
},
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 2fd3aef44ebd5..022ea3af7f58a 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -731,7 +731,6 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
if (!taskOp.getDependVars().empty() && taskOp.getDepends())
buildDependData(taskOp.getDepends(), taskOp.getDependVars(),
moduleTranslation, dds);
- llvm::errs() << "# Dependencies in task op = " << dds.size() << "\n";
llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
findAllocaInsertPoint(builder, moduleTranslation);
@@ -3097,7 +3096,6 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
if (!targetOp.getDependVars().empty() && targetOp.getDepends())
buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
moduleTranslation, dds);
- llvm::errs() << "# Dependencies in target op = " << dds.size() << "\n";
builder.restoreIP(moduleTranslation.getOpenMPBuilder()->newCreateTarget(
ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
>From e32335698794a86fdf904739d01715445af81000 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Tue, 28 May 2024 15:31:06 -0500
Subject: [PATCH 08/26] clean up, clean up, everybody clean up
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 23 -----------------------
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 3 ---
llvm/lib/IR/BasicBlock.cpp | 6 +-----
llvm/lib/IR/Function.cpp | 13 +++----------
llvm/lib/IR/Module.cpp | 4 +---
5 files changed, 5 insertions(+), 44 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index f56c878b45df8..3dfc6bccc8c05 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1320,18 +1320,10 @@ llvm::Function *CGOpenMPRuntime::emitTaskOutlinedFunction(
HasCancel = TD->hasCancel();
CodeGenFunction CGF(CGM, true);
- // llvm::errs() << "LLVMDEBUG::Before CGInfo\n";
- // CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
CGOpenMPTaskOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen,
InnermostKind, HasCancel, Action);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
- // llvm::errs() << "LLVMDEBUG::Before GenerateCapturedStmt\n";
- // CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
llvm::Function *Res = CGF.GenerateCapturedStmtFunction(*CS);
- llvm::errs() << "LLVMDEBUG::After GenerateCapturedStmt\n";
- llvm::errs() << "LLVMDEBUG::CapturedStmt is \n";
- CS->dump();
- CGF.Builder.GetInsertBlock()->getParent()->getParent()->dump();
if (!Tied)
NumberOfParts = Action.getNumberOfParts();
return Res;
@@ -3715,15 +3707,6 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
KmpTaskTWithPrivatesQTy, KmpTaskTQTy, SharedsPtrTy, TaskFunction,
TaskPrivatesMap);
- llvm::errs() << "LLVMDEBUG::Proxy task function is \n";
- TaskEntry->dump();
- llvm::errs() << "LLVMDEBUG::CGF.Builder.GetInsertBlock() after emitting "
- "proxy task function is \n";
- CGF.Builder.GetInsertBlock()->dump();
- llvm::errs() << "LLVMDEBUG::SharedsTy is \n";
- CharUnits cu = C.getTypeSizeInChars(SharedsTy);
- llvm::errs() << "LLVMDEBUG::sizeof(SharedsTy) = \n";
- llvm::errs() << cu.getQuantity() << "\n";
// build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
// kmp_routine_entry_t *task_entry);
@@ -9566,15 +9549,9 @@ static void emitTargetCallKernelLaunch(
emitOffloadingArrays(CGF, CombinedInfo, Info, OMPBuilder);
bool EmitDebug = CGF.CGM.getCodeGenOpts().getDebugInfo() !=
llvm::codegenoptions::NoDebugInfo;
- llvm::errs() << "LLVMDEBUG::After emitOffloadingArrays in "
- "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
- OMPBuilder.Builder.GetInsertBlock()->dump();
OMPBuilder.emitOffloadingArraysArgument(CGF.Builder, Info.RTArgs, Info,
EmitDebug,
/*ForEndCall=*/false);
- llvm::errs() << "LLVMDEBUG::After emitOffloadingArraysArgument in "
- "CGOpenMPRuntime.cpp::emitTargetCallKernelLaunch\n";
- OMPBuilder.Builder.GetInsertBlock()->dump();
InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
InputInfo.BasePointersArray = Address(Info.RTArgs.BasePointersArray,
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index be0717898ff25..565e13c2ad8c9 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5395,9 +5395,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
bool HasNoWait) {
- LLVM_DEBUG(dbgs() << "emitTargetTask:OMPBuilder.Builder = " << &this->Builder
- << ", Builder = " << &Builder << "\n");
-
BasicBlock *TargetTaskBodyBB =
splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
BasicBlock *TargetTaskAllocaBB =
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 205065aef6488..29f2cbf611fa3 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -445,12 +445,8 @@ BasicBlock::const_iterator BasicBlock::getFirstNonPHIOrDbgOrAlloca() const {
}
void BasicBlock::dropAllReferences() {
- // bool debug_on = (this->getName() == "target.task.alloca");
- for (Instruction &I : *this) {
- LLVM_DEBUG(dbgs() << "Dropping all references in I = " << I << "\n");
+ for (Instruction &I : *this)
I.dropAllReferences();
- LLVM_DEBUG(dbgs() << "After Dropping all references in I = " << I << "\n");
- }
}
const BasicBlock *BasicBlock::getSinglePredecessor() const {
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index bb68755bea733..15259b46afe38 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -72,7 +72,6 @@
#include <cstdint>
#include <cstring>
#include <string>
-#define DEBUG_TYPE "pranav"
using namespace llvm;
using ProfileCount = Function::ProfileCount;
@@ -551,15 +550,10 @@ void Function::stealArgumentListFrom(Function &Src) {
void Function::deleteBodyImpl(bool ShouldDrop) {
setIsMaterializable(false);
- bool OldDebugFlag = DebugFlag;
- if (this->getName() == "_QQmain..omp_par.1") {
- DebugFlag = true;
- }
- for (BasicBlock &BB : *this) {
- LLVM_DEBUG(dbgs() << "Dropping all references in " << BB << "\n");
+
+ for (BasicBlock &BB : *this)
BB.dropAllReferences();
- LLVM_DEBUG(dbgs() << "After Dropping all references in " << BB << "\n");
- }
+
// Delete all basic blocks. They are now unused, except possibly by
// blockaddresses, but BasicBlock's destructor takes care of those.
while (!BasicBlocks.empty())
@@ -579,7 +573,6 @@ void Function::deleteBodyImpl(bool ShouldDrop) {
}
setValueSubclassData(getSubclassDataFromValue() & ~0xe);
}
- DebugFlag = OldDebugFlag;
// Metadata is stored in a side-table.
clearMetadata();
}
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 4d986ded06f11..f97dd18c736c5 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -539,10 +539,8 @@ std::string Module::getUniqueIntrinsicName(StringRef BaseName, Intrinsic::ID Id,
// has "dropped all references", except operator delete.
//
void Module::dropAllReferences() {
- for (Function &F : *this) {
- // llvm::errs() << "Dropping all references in " << F.getName() << "\n";
+ for (Function &F : *this)
F.dropAllReferences();
- }
for (GlobalVariable &GV : globals())
GV.dropAllReferences();
>From 264dfa6be0544f325b8c480f0d521f12a3fed6fd Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Wed, 29 May 2024 15:43:49 -0500
Subject: [PATCH 09/26] Add an MLIR lit test
---
mlir/test/Target/LLVMIR/omptarget-depend.mlir | 140 ++++++++++++++++++
1 file changed, 140 insertions(+)
create mode 100644 mlir/test/Target/LLVMIR/omptarget-depend.mlir
diff --git a/mlir/test/Target/LLVMIR/omptarget-depend.mlir b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
new file mode 100644
index 0000000000000..c386342005e5e
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
@@ -0,0 +1,140 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+ llvm.func @_QQmain() attributes {fir.bindc_name = "main"} {
+ %0 = llvm.mlir.constant(39 : index) : i64
+ %1 = llvm.mlir.constant(0 : index) : i64
+ %2 = llvm.mlir.constant(1 : index) : i64
+ %3 = llvm.mlir.constant(40 : index) : i64
+ %4 = llvm.mlir.addressof @_QFEa : !llvm.ptr
+ %5 = llvm.mlir.addressof @_QFEb : !llvm.ptr
+ %6 = llvm.mlir.constant(1 : i64) : i64
+ %7 = llvm.alloca %6 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+ %8 = llvm.mlir.addressof @_QFEn : !llvm.ptr
+ omp.task {
+ %14 = llvm.mlir.constant(1 : i64) : i64
+ %15 = llvm.alloca %14 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr
+ %16 = llvm.load %8 : !llvm.ptr -> i32
+ %17 = llvm.sext %16 : i32 to i64
+ %18 = llvm.trunc %2 : i64 to i32
+ llvm.br ^bb1(%18, %17 : i32, i64)
+ ^bb1(%19: i32, %20: i64): // 2 preds: ^bb0, ^bb2
+ %21 = llvm.icmp "sgt" %20, %1 : i64
+ llvm.cond_br %21, ^bb2, ^bb3
+ ^bb2: // pred: ^bb1
+ llvm.store %19, %15 : i32, !llvm.ptr
+ %22 = llvm.load %15 : !llvm.ptr -> i32
+ %23 = llvm.sext %22 : i32 to i64
+ %24 = llvm.mlir.constant(1 : i64) : i64
+ %25 = llvm.mlir.constant(0 : i64) : i64
+ %26 = llvm.sub %23, %24 overflow<nsw> : i64
+ %27 = llvm.mul %26, %24 overflow<nsw> : i64
+ %28 = llvm.mul %27, %24 overflow<nsw> : i64
+ %29 = llvm.add %28, %25 overflow<nsw> : i64
+ %30 = llvm.mul %24, %3 overflow<nsw> : i64
+ %31 = llvm.getelementptr %4[%29] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ llvm.store %22, %31 : i32, !llvm.ptr
+ %32 = llvm.load %15 : !llvm.ptr -> i32
+ %33 = llvm.add %32, %18 : i32
+ %34 = llvm.sub %20, %2 : i64
+ llvm.br ^bb1(%33, %34 : i32, i64)
+ ^bb3: // pred: ^bb1
+ llvm.store %19, %15 : i32, !llvm.ptr
+ omp.terminator
+ }
+ %9 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%0 : i64) extent(%3 : i64) stride(%2 : i64) start_idx(%2 : i64)
+ %10 = omp.map.info var_ptr(%4 : !llvm.ptr, !llvm.array<40 x i32>) map_clauses(to) capture(ByRef) bounds(%9) -> !llvm.ptr {name = "a"}
+ %11 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.array<40 x i32>) map_clauses(from) capture(ByRef) bounds(%9) -> !llvm.ptr {name = "b"}
+ %12 = omp.map.info var_ptr(%7 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
+ %13 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "n"}
+ omp.target map_entries(%10 -> %arg0, %11 -> %arg1, %12 -> %arg2, %13 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) depend(taskdependin -> %4 : !llvm.ptr) {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: !llvm.ptr):
+ %14 = llvm.mlir.constant(0 : index) : i64
+ %15 = llvm.mlir.constant(10 : i32) : i32
+ %16 = llvm.mlir.constant(1 : index) : i64
+ %17 = llvm.mlir.constant(40 : index) : i64
+ %18 = llvm.load %arg3 : !llvm.ptr -> i32
+ %19 = llvm.sext %18 : i32 to i64
+ %20 = llvm.trunc %16 : i64 to i32
+ llvm.br ^bb1(%20, %19 : i32, i64)
+ ^bb1(%21: i32, %22: i64): // 2 preds: ^bb0, ^bb2
+ %23 = llvm.icmp "sgt" %22, %14 : i64
+ llvm.cond_br %23, ^bb2, ^bb3
+ ^bb2: // pred: ^bb1
+ llvm.store %21, %arg2 : i32, !llvm.ptr
+ %24 = llvm.load %arg2 : !llvm.ptr -> i32
+ %25 = llvm.sext %24 : i32 to i64
+ %26 = llvm.mlir.constant(1 : i64) : i64
+ %27 = llvm.mlir.constant(0 : i64) : i64
+ %28 = llvm.sub %25, %26 overflow<nsw> : i64
+ %29 = llvm.mul %28, %26 overflow<nsw> : i64
+ %30 = llvm.mul %29, %26 overflow<nsw> : i64
+ %31 = llvm.add %30, %27 overflow<nsw> : i64
+ %32 = llvm.mul %26, %17 overflow<nsw> : i64
+ %33 = llvm.getelementptr %arg0[%31] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ %34 = llvm.load %33 : !llvm.ptr -> i32
+ %35 = llvm.add %34, %15 : i32
+ %36 = llvm.mlir.constant(1 : i64) : i64
+ %37 = llvm.mlir.constant(0 : i64) : i64
+ %38 = llvm.sub %25, %36 overflow<nsw> : i64
+ %39 = llvm.mul %38, %36 overflow<nsw> : i64
+ %40 = llvm.mul %39, %36 overflow<nsw> : i64
+ %41 = llvm.add %40, %37 overflow<nsw> : i64
+ %42 = llvm.mul %36, %17 overflow<nsw> : i64
+ %43 = llvm.getelementptr %arg1[%41] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ llvm.store %35, %43 : i32, !llvm.ptr
+ %44 = llvm.load %arg2 : !llvm.ptr -> i32
+ %45 = llvm.add %44, %20 : i32
+ %46 = llvm.sub %22, %16 : i64
+ llvm.br ^bb1(%45, %46 : i32, i64)
+ ^bb3: // pred: ^bb1
+ llvm.store %21, %arg2 : i32, !llvm.ptr
+ omp.terminator
+ }
+ llvm.return
+ }
+ llvm.mlir.global internal @_QFEa() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
+ %0 = llvm.mlir.zero : !llvm.array<40 x i32>
+ llvm.return %0 : !llvm.array<40 x i32>
+ }
+ llvm.mlir.global internal @_QFEb() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
+ %0 = llvm.mlir.zero : !llvm.array<40 x i32>
+ llvm.return %0 : !llvm.array<40 x i32>
+ }
+ llvm.mlir.global internal @_QFEc() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
+ %0 = llvm.mlir.zero : !llvm.array<40 x i32>
+ llvm.return %0 : !llvm.array<40 x i32>
+ }
+ llvm.mlir.global internal @_QFEn() {addr_space = 0 : i32} : i32 {
+ %0 = llvm.mlir.constant(40 : i32) : i32
+ llvm.return %0 : i32
+ }
+ llvm.func @_FortranAProgramStart(i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
+ llvm.func @_FortranAProgramEndStatement() attributes {sym_visibility = "private"}
+ llvm.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 {
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ %1 = llvm.mlir.zero : !llvm.ptr
+ llvm.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %1) {fastmathFlags = #llvm.fastmath<contract>} : (i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> ()
+ llvm.call @_QQmain() {fastmathFlags = #llvm.fastmath<contract>} : () -> ()
+ llvm.call @_FortranAProgramEndStatement() {fastmathFlags = #llvm.fastmath<contract>} : () -> ()
+ llvm.return %0 : i32
+ }
+
+// %structArg holds pointers to shared data.
+// CHECK: define void @_QQmain() {
+// CHECK-DAG: %[[STRUCTARG:.+]] = alloca { ptr, ptr, ptr }, align 8
+// CHECK-DAG: %[[DEP_ARRAY:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK: %[[DEP_INFO:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARRAY]], i64 0, i64 0
+// CHECK: %[[PTR0:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 0
+// CHECK: store i64 ptrtoint (ptr @_QFEa to i64), ptr %[[PTR0]], align 4
+// CHECK: %[[PTR1:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 1
+// CHECK: store i64 8, ptr %[[PTR1]], align 4
+// CHECK: %[[PTR2:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 2
+// CHECK: store i8 1, ptr %[[PTR2]], align 1
+
+// CHECK: %[[TASKDATA:.+]] = call ptr @__kmpc_omp_task_alloc({{.+}}, ptr @.omp_target_task_proxy_func)
+// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[TASKDATA]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHARED_DATA]], ptr align 1 %[[STRUCTARG]], i64 24, i1 false)
+// CHECK: call void @__kmpc_omp_wait_deps({{.+}}, i32 1, ptr %[[DEP_ARRAY]], i32 0, ptr null)
+// CHECK: call void @__kmpc_omp_task_begin_if0({{.+}}, ptr %[[TASKDATA]])
+// CHECK: call void @.omp_target_task_proxy_func({{.+}}, ptr %[[TASKDATA]])
+// CHECK: call void @__kmpc_omp_task_complete_if0({{.+}}, ptr %[[TASKDATA]])
+
>From 25021ebeca17e7268c36c85d0dd0172ff41925c5 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 13:08:29 -0500
Subject: [PATCH 10/26] add an end-to-end offloading test for target depend
---
.../test/offloading/fortran/target-depend.f90 | 40 +++++++++++++++++++
1 file changed, 40 insertions(+)
create mode 100644 offload/test/offloading/fortran/target-depend.f90
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
new file mode 100644
index 0000000000000..6a05cf4c025e8
--- /dev/null
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -0,0 +1,40 @@
+! Offloading test checking the use of the depend clause on
+! the target construct
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+ integer :: a = 0
+ call foo(5, a)
+ print*, "======= FORTRAN Test passed! ======="
+ print*, "foo(5) returned ", a, ", expected 6\n"
+ ! stop 0
+end program main
+subroutine foo(N, r)
+ integer, intent(in) :: N
+ integer, intent(out) :: r
+ integer :: z
+
+ z = 1
+ !$omp task depend(out: z) shared(z)
+ ! print*, "N is ", N
+ ! print*, "z is ", z
+ z = N
+! print*, "z is ", z
+ !$omp end task
+
+ !$omp target map(tofrom: z) depend(in: z)
+ z = z + 1
+ !$omp end target
+
+ r = z
+end subroutine foo
+
+!CHECK: ======= FORTRAN Test passed! =======
+!CHECK: foo(5) returned 6 , expected 6
>From e484ee2de917e3bed923a4f4a68d815900fd1c4f Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 13:38:12 -0500
Subject: [PATCH 11/26] Clean up, clean up, everybody clean up (some more)
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 16 +++--------
clang/lib/CodeGen/CGStmt.cpp | 27 ------------------
clang/lib/CodeGen/CGStmtOpenMP.cpp | 7 -----
clang/lib/CodeGen/CodeGenFunction.h | 3 --
clang/lib/Parse/ParseOpenMP.cpp | 11 --------
clang/lib/Sema/SemaOpenMP.cpp | 28 -------------------
llvm/lib/IR/Function.cpp | 1 -
llvm/lib/IR/Instruction.cpp | 5 ----
mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp | 3 +-
.../test/offloading/fortran/target-depend.f90 | 3 --
10 files changed, 5 insertions(+), 99 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 3dfc6bccc8c05..f6d12d46cfc07 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -48,10 +48,6 @@
using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;
-// Experiment to make sanitizers easier to debug
-static llvm::cl::opt<bool> NewClangTargetTaskCodeGen(
- "new-clang-target-task-codegen", llvm::cl::Optional,
- llvm::cl::desc("new clang target task codegen."), llvm::cl::init(false));
namespace {
/// Base class for handling code generation inside OpenMP regions.
@@ -3707,7 +3703,7 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
KmpTaskTWithPrivatesQTy, KmpTaskTQTy, SharedsPtrTy, TaskFunction,
TaskPrivatesMap);
- // build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
+ // Build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
// kmp_routine_entry_t *task_entry);
// Task flags. Format is taken from
@@ -9624,13 +9620,9 @@ static void emitTargetCallKernelLaunch(
DeviceID, RTLoc, AllocaIP));
};
- if (RequiresOuterTask) {
- if (NewClangTargetTaskCodeGen) {
- llvm::errs() << "Using OMPIRBuilder for target task codegen\n";
- } else {
- CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
- }
- } else
+ if (RequiresOuterTask)
+ CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
+ else
OMPRuntime->emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen);
}
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 26baad23b87c5..99daaa14cf3fe 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -3135,12 +3135,6 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
const RecordDecl *RD = S.getCapturedRecordDecl();
SourceLocation Loc = S.getBeginLoc();
assert(CD->hasBody() && "missing CapturedDecl body");
- llvm::errs() << "LLVMDEBUG:: In GenerateCapturedStmtFunction\n";
- if (Builder.GetInsertBlock()) {
- llvm::errs()
- << "LLVMDEBUG:: In GenerateCapturedStmtFunction, InsertBlock is \n";
- Builder.GetInsertBlock()->dump();
- }
// Build the argument list.
ASTContext &Ctx = CGM.getContext();
@@ -3162,13 +3156,6 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
// Generate the function.
StartFunction(CD, Ctx.VoidTy, F, FuncInfo, Args, CD->getLocation(),
CD->getBody()->getBeginLoc());
- llvm::errs()
- << "LLVMDEBUG:: In GenerateCapturedStmtFunction: After StartFunction\n";
- if (Builder.GetInsertBlock()) {
- llvm::errs()
- << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
- Builder.GetInsertBlock()->getParent()->dump();
- }
// Set the context parameter in CapturedStmtInfo.
Address DeclPtr = GetAddrOfLocalVar(CD->getContextParam());
CapturedStmtInfo->setContextValue(Builder.CreateLoad(DeclPtr));
@@ -3194,21 +3181,7 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
}
PGO.assignRegionCounters(GlobalDecl(CD), F);
- llvm::errs()
- << "LLVMDEBUG:: In GenerateCapturedStmtFunction: Before EmitBody\n";
- if (Builder.GetInsertBlock()) {
- llvm::errs()
- << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
- Builder.GetInsertBlock()->getParent()->dump();
- }
CapturedStmtInfo->EmitBody(*this, CD->getBody());
- llvm::errs()
- << "LLVMDEBUG:: In GenerateCapturedStmtFunction: After EmitBody\n";
- if (Builder.GetInsertBlock()) {
- llvm::errs()
- << "LLVMDEBUG:: In GenerateCapturedStmtFunction, Function is \n";
- Builder.GetInsertBlock()->getParent()->dump();
- }
FinishFunction(CD->getBodyRBrace());
return F;
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 1cd3e72c38cc0..040b52a1101dd 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5043,11 +5043,6 @@ createImplicitFirstprivateForType(ASTContext &C, OMPTaskDataTy &Data,
Data.FirstprivateInits.emplace_back(InitRef);
return OrigVD;
}
-void CodeGenFunction::NewEmitOMPTargetTaskBasedDirective(
- const OMPExecutableDirective &S, const RegionCodeGenTy &BodyGen,
- OMPTargetDataInfo &InputInfo) {
- EmitOMPTargetTaskBasedDirective(S, BodyGen, InputInfo);
-}
void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
const OMPExecutableDirective &S, const RegionCodeGenTy &BodyGen,
OMPTargetDataInfo &InputInfo) {
@@ -5187,8 +5182,6 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
llvm::Function *OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction(
S, *I, *PartId, *TaskT, S.getDirectiveKind(), CodeGen, /*Tied=*/true,
Data.NumberOfParts);
- llvm::errs() << "LLVMDEBUG::Outlined Task Fn is \n";
- OutlinedFn->dump();
llvm::APInt TrueOrFalse(32, S.hasClausesOfKind<OMPNowaitClause>() ? 1 : 0);
IntegerLiteral IfCond(getContext(), TrueOrFalse,
getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index b80be8ed85458..45585361a4fc9 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3797,9 +3797,6 @@ class CodeGenFunction : public CodeGenTypeCache {
void EmitOMPTargetTaskBasedDirective(const OMPExecutableDirective &S,
const RegionCodeGenTy &BodyGen,
OMPTargetDataInfo &InputInfo);
- void NewEmitOMPTargetTaskBasedDirective(const OMPExecutableDirective &S,
- const RegionCodeGenTy &BodyGen,
- OMPTargetDataInfo &InputInfo);
void processInReduction(const OMPExecutableDirective &S,
OMPTaskDataTy &Data,
CodeGenFunction &CGF,
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 547dd8fcf4552..33debdd3b1476 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2972,19 +2972,10 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
// FIXME: We create a bogus CompoundStmt scope to hold the contents of
// the captured region. Code elsewhere assumes that any FunctionScopeInfo
// should have at least one compound statement scope within it.
- if (AssociatedStmt.get()) {
- llvm::errs() << __FUNCTION__ << "Loc-1:\n";
- AssociatedStmt.get()->dump();
- }
ParsingOpenMPDirectiveRAII NormalScope(*this, /*Value=*/false);
{
Sema::CompoundScopeRAII Scope(Actions);
AssociatedStmt = ParseStatement();
- // Stmt *pdb_print = AssociatedStmt.get();
- // if (pdb_print) {
- // llvm::errs() << __FUNCTION__ << "Loc0:\n";
- // pdb_print->dump();
- // }
if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
getLangOpts().OpenMPIRBuilder)
AssociatedStmt =
@@ -2992,8 +2983,6 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
}
AssociatedStmt =
Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses);
- // llvm::errs() << __FUNCTION__ << "Loc1:\n";
- // AssociatedStmt.get()->dump();
} else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data ||
DKind == OMPD_target_exit_data) {
Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope());
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 211b93a171dfe..b37a17d0e72a5 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4854,19 +4854,6 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, DSAStack->getCurrentDirective());
- // llvm::errs() << __FUNCTION__ << ": Loc0:\n";
- // for (OpenMPDirectiveKind c : CaptureRegions) {
- // switch (c) {
- // case OMPD_task:
- // llvm::errs() << "OMPD_task\n";
- // break;
- // case OMPD_target:
- // llvm::errs() << "OMPD_target\n";
- // break;
- // default:
- // llvm::errs() << "default\n";
- // }
- // }
OMPOrderedClause *OC = nullptr;
OMPScheduleClause *SC = nullptr;
SmallVector<const OMPLinearClause *, 4> LCs;
@@ -5018,11 +5005,7 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
}
if (++CompletedRegions == CaptureRegions.size())
DSAStack->setBodyComplete();
- // llvm::errs() << __FUNCTION__ << ": Loc1:\n";
- // SR.get()->dump();
SR = SemaRef.ActOnCapturedRegionEnd(SR.get());
- // llvm::errs() << __FUNCTION__ << ": Loc2:\n";
- // SR.get()->dump();
}
return SR;
}
@@ -6354,17 +6337,6 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
OpenMPBindClauseKind BindKind = OMPC_BIND_unknown;
llvm::SmallVector<OMPClause *> ClausesWithoutBind;
bool UseClausesWithoutBind = false;
- // if (Kind == Directive::OMPD_target) {
- // if (AStmt) {
- // llvm::errs() << __FUNCTION__ << "***********************\n";
- // AStmt->dump();
- // llvm::errs() << __FUNCTION__ << "***PRETTY***\n";
- // AStmt->dumpPretty(getASTContext());
- // } else {
- // llvm::errs() << "__FUNCTION__"
- // << ": AStmt is nullptr\n";
- // }
- // }
if (const OMPBindClause *BC =
OMPExecutableDirective::getSingleClause<OMPBindClause>(Clauses))
BindKind = BC->getBindKind();
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 15259b46afe38..74a6fa80f1f7f 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -64,7 +64,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ModRef.h"
#include <cassert>
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 22f5b5a41fc3f..29272e627a1d1 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -128,11 +128,6 @@ void Instruction::insertAfter(Instruction *InsertPos) {
BasicBlock::iterator Instruction::insertInto(BasicBlock *ParentBB,
BasicBlock::iterator It) {
assert(getParent() == nullptr && "Expected detached instruction");
- if (!(It == ParentBB->end() || It->getParent() == ParentBB)) {
- llvm::errs() << "ParentBB = " << *ParentBB << "\n";
- llvm::errs() << "It = " << *It << "\n";
- llvm::errs() << "It->getParent() = " << *It->getParent() << "\n";
- }
assert((It == ParentBB->end() || It->getParent() == ParentBB) &&
"It not in ParentBB");
insertBefore(*ParentBB, It);
diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
index c0c03df7cbc5d..4558893779534 100644
--- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
@@ -18,7 +18,6 @@
#include "mlir/Tools/mlir-translate/Translation.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/Verifier.h"
using namespace mlir;
@@ -31,7 +30,7 @@ void registerToLLVMIRTranslation() {
auto llvmModule = translateModuleToLLVMIR(op, llvmContext);
if (!llvmModule)
return failure();
- llvm::verifyModule(*llvmModule);
+
llvmModule->print(output, nullptr);
return success();
},
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
index 6a05cf4c025e8..db58f2db6bbe9 100644
--- a/offload/test/offloading/fortran/target-depend.f90
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -23,10 +23,7 @@ subroutine foo(N, r)
z = 1
!$omp task depend(out: z) shared(z)
- ! print*, "N is ", N
- ! print*, "z is ", z
z = N
-! print*, "z is ", z
!$omp end task
!$omp target map(tofrom: z) depend(in: z)
>From febda4fbf1690d0dbe2bb639d106ef28b481bdff Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 13:41:27 -0500
Subject: [PATCH 12/26] Add back some ws that was removed
---
clang/lib/CodeGen/CGStmtOpenMP.cpp | 1 +
clang/lib/Parse/ParseOpenMP.cpp | 1 +
clang/lib/Sema/SemaOpenMP.cpp | 1 +
llvm/lib/IR/Function.cpp | 2 ++
4 files changed, 5 insertions(+)
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 040b52a1101dd..6410f9e102c90 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5043,6 +5043,7 @@ createImplicitFirstprivateForType(ASTContext &C, OMPTaskDataTy &Data,
Data.FirstprivateInits.emplace_back(InitRef);
return OrigVD;
}
+
void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
const OMPExecutableDirective &S, const RegionCodeGenTy &BodyGen,
OMPTargetDataInfo &InputInfo) {
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 33debdd3b1476..e959dd6378f46 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2976,6 +2976,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
{
Sema::CompoundScopeRAII Scope(Actions);
AssociatedStmt = ParseStatement();
+
if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
getLangOpts().OpenMPIRBuilder)
AssociatedStmt =
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index b37a17d0e72a5..bab61e8fd54e8 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -6337,6 +6337,7 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
OpenMPBindClauseKind BindKind = OMPC_BIND_unknown;
llvm::SmallVector<OMPClause *> ClausesWithoutBind;
bool UseClausesWithoutBind = false;
+
if (const OMPBindClause *BC =
OMPExecutableDirective::getSingleClause<OMPBindClause>(Clauses))
BindKind = BC->getBindKind();
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 74a6fa80f1f7f..13fa1afeaaff2 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -74,6 +74,7 @@
using namespace llvm;
using ProfileCount = Function::ProfileCount;
+
// Explicit instantiations of SymbolTableListTraits since some of the methods
// are not in the public header file...
template class llvm::SymbolTableListTraits<BasicBlock>;
@@ -572,6 +573,7 @@ void Function::deleteBodyImpl(bool ShouldDrop) {
}
setValueSubclassData(getSubclassDataFromValue() & ~0xe);
}
+
// Metadata is stored in a side-table.
clearMetadata();
}
>From 9cc0234e9e9533883ecea4b71119e497e89d9e81 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 17:05:16 -0500
Subject: [PATCH 13/26] More cleanup and comments
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 14 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 172 ++++++++++++------
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 2 +-
3 files changed, 120 insertions(+), 68 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 8a67cd4b8d9f0..2ed130c87d40b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2249,15 +2249,8 @@ class OpenMPIRBuilder {
/// \param BodyGenCB Callback that will generate the region code.
/// \param ArgAccessorFuncCB Callback that will generate accessors
/// instructions for passed in target arguments where neccessary
-
- InsertPointTy newCreateTarget(
- const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy AllocaIP,
- OpenMPIRBuilder::InsertPointTy CodeGenIP,
- TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, int32_t NumThreads,
- SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
- TargetBodyGenCallbackTy BodyGenCB,
- TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
- SmallVector<DependData> Dependencies = {});
+ /// \param Dependencies A vector of DependData objects that carry
+ /// dependency information as passed in the depend clause
InsertPointTy createTarget(const LocationDescription &Loc,
OpenMPIRBuilder::InsertPointTy AllocaIP,
OpenMPIRBuilder::InsertPointTy CodeGenIP,
@@ -2266,7 +2259,8 @@ class OpenMPIRBuilder {
SmallVectorImpl<Value *> &Inputs,
GenMapInfoCallbackTy GenMapInfoCB,
TargetBodyGenCallbackTy BodyGenCB,
- TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB);
+ TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
+ SmallVector<DependData> Dependencies = {});
/// Returns __kmpc_for_static_init_* runtime function for the specified
/// size \a IVSize and sign \a IVSigned. Will create a distribute call
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 565e13c2ad8c9..da2bf360cc8fd 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -73,11 +73,6 @@ static cl::opt<double> UnrollThresholdFactor(
"simplifications still taking place"),
cl::init(1.5));
-static cl::opt<bool>
- NewOMPIRBuilderTargetCodegen("new-ompirbuilder-target-codegen", cl::Hidden,
- cl::desc("Use target-task based codegen."),
- cl::init(false));
-
#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
@@ -833,6 +828,9 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
if (!OffloadInfoManager.empty())
createOffloadEntriesAndInfoMetadata(ErrorReportFn);
+
+ LLVM_DEBUG(dbgs() << "Module after OMPIRBuilder::finalize\n");
+ LLVM_DEBUG(dbgs() << M << "\n");
}
OpenMPIRBuilder::~OpenMPIRBuilder() {
@@ -5301,15 +5299,18 @@ static Function *createOutlinedFunction(
return Func;
}
-// define internal i32 @.omp_task_entry..3(i32 noundef %0, ptr noalias noundef
-// %1) #3 {
+// Create an entry point for a target task.
+// It'll have the following signature
+// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
+// This function is called from emitTargetTask once the
+// code to launch the target kernel has been outlined already.
static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
IRBuilderBase &Builder,
CallInst *StaleCI) {
- // Create a function with the following signature
- // define internal i32 @.omp_task_entry..3(i32 noundef %0, ptr noalias noundef
- // %1) #3 {
Module &M = OMPBuilder.M;
+ // CalledFunction is the target launch function, i.e.
+ // the function that sets up kernel arguments and calls
+ // __tgt_target_kernel to launch the kernel on the device.
Function *CalledFunction = StaleCI->getCalledFunction();
OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
StaleCI->getIterator());
@@ -5323,14 +5324,16 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
".omp_target_task_proxy_func",
Builder.GetInsertBlock()->getModule());
- // auto OldInsertPoint = Builder.saveIP();
BasicBlock *EntryBB =
BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
Builder.SetInsertPoint(EntryBB);
bool HasShareds = StaleCI->arg_size() > 1;
- // PDB: Temporary assert.
+ // TODO: This is a temporary assert to prove to ourselves that
+ // the outlined target launch function is always going to have
+ // at most two arguments if there is any data shared between
+ // host and device.
assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
"StaleCI with shareds should have exactly two arguments.");
if (HasShareds) {
@@ -5363,12 +5366,9 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
Builder.CreateCall(CalledFunction, {ThreadId, NewArgStructAlloca});
}
- // CalledFunction->removeFnAttr(llvm::Attribute::NoInline);
- // CalledFunction->addFnAttr(llvm::Attribute::AlwaysInline);
ProxyFn->getArg(0)->setName("thread.id");
ProxyFn->getArg(1)->setName("task");
Builder.CreateRetVoid();
- // Builder.restoreIP(OldInsertPoint);
return ProxyFn;
}
static void emitTargetOutlinedFunction(
@@ -5395,6 +5395,87 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
bool HasNoWait) {
+ // When we arrive at this function, the target region itself has been
+ // outlined into the function OutlinedFn.
+ // So at this point, for
+ // --------------------------------------------------
+ // void user_code_that_offloads(...) {
+ // omp target depend(..) map(from:a) map(to:b, c)
+ // a = b + c
+ // }
+ //
+ // --------------------------------------------------
+ //
+ // we have
+ //
+ // --------------------------------------------------
+ //
+ // void user_code_that_offloads(...) {
+ // %.offload_baseptrs = alloca [3 x ptr], align 8
+ // %.offload_ptrs = alloca [3 x ptr], align 8
+ // %.offload_mappers = alloca [3 x ptr], align 8
+ // ;; target region has been outlined and now we need to
+ // ;; offload to it via a target task.
+ // }
+ // void outlined_device_function(ptr a, ptr b, ptr c) {
+ // *a = *b + *c
+ // }
+ //
+ // We have to now do the following
+ // (i) Make an offloading call to outlined_device_function using the OpenMP RTL
+ // See 'kernel_launch_function' in the pseudo code below. This is emitted by
+ // emitKernelLaunch
+ // (ii) Create a task entry point function that calls kernel_launch_function and
+ // is the entry point for the target task. See '@.omp_target_task_proxy_func'
+ // in the pseudocode below.
+ // (iii) Create a task with the task entry point created in (ii)
+ //
+ // That is we create the following
+ //
+ // void user_code_that_offloads(...) {
+ // %.offload_baseptrs = alloca [3 x ptr], align 8
+ // %.offload_ptrs = alloca [3 x ptr], align 8
+ // %.offload_mappers = alloca [3 x ptr], align 8
+ //
+ // %structArg = alloca { ptr, ptr, ptr }, align 8
+ // %structArg[0] = %.offload_baseptrs
+ // %structArg[1] = %.offload_ptrs
+ // %structArg[2] = %.offload_mappers
+ // proxy_target_task = @__kmpc_omp_task_alloc(..., @.omp_target_task_proxy_func)
+ // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
+ // dependencies_array = alloca [<num_deps> x %struct.kmp_dep_info]
+ // ;; if nowait not present
+ // call @__kmpc_omp_wait_deps(..., dependencies_array)
+ // call @__kmpc_omp_task_begin_if0(...)
+ // call @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
+ // call @__kmpc_omp_task_complete_if0(...)
+ // }
+ //
+ // define internal void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task) {
+ // %structArg = alloca {ptr, ptr, ptr}
+ // %shared_data = load (getelementptr %task, 0, 0)
+ // memcpy(%structArg, %shared_data, sizeof(structArg))
+ // kernel_launch_function(%thread.id, %structArg)
+ // }
+ //
+ // We need the proxy function because the signature of the task entry point expected
+ // by kmpc_omp_task is always the same and will be different from that of the
+ // kernel_launch function.
+ //
+ // kernel_launch_function is generated by emitKernelLaunch and has the always_inline
+ // attribute.
+ // void kernel_launch_function(thread_id, structArg) alwaysinline {
+ // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
+ // offload_baseptrs = load(getelementptr structArg, 0, 0)
+ // offload_ptrs = load(getelementptr structArg, 0, 1)
+ // offload_mappers = load(getelementptr structArg, 0, 2)
+ // ; setup kernel_args using offload_baseptrs, offload_ptrs and offload_mappers
+ // call i32 @__tgt_target_kernel(..., outlined_device_function, ptr %kernel_args)
+ // }
+ // void outlined_device_function(ptr a, ptr b, ptr c) {
+ // *a = *b + *c
+ // }
+ //
BasicBlock *TargetTaskBodyBB =
splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
BasicBlock *TargetTaskAllocaBB =
@@ -5417,12 +5498,14 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
Builder.restoreIP(TargetTaskBodyIP);
// emitKernelLaunch makes the necessary runtime call to offload the kernel.
- // We then outline all that code into a separate function that is called
- // by the task wrapper function (aka Proxy task function - see
- // emitProxyTaskFunction)
+ // We then outline all that code into a separate function ('kernel_launch_function' in
+ // the pseudo code above). This function is then called by the target task proxy
+ // function (see '@.omp_target_task_proxy_func' in the pseudo code above)
+ // "@.omp_target_task_proxy_func' is generated by emitProxyTaskFunction
Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
EmitTargetCallFallbackCB, Args, DeviceID,
RTLoc, TargetTaskAllocaIP));
+
OI.ExitBB = Builder.saveIP().getBlock();
OI.PostOutlineCB = [this, ToBeDeleted, Dependencies,
HasNoWait](Function &OutlinedFn) mutable {
@@ -5439,6 +5522,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
<< "\n");
Function *ProxyFn = emitProxyTaskFunction(*this, Builder, StaleCI);
+
LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
<< "\n");
@@ -5546,13 +5630,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
}
StaleCI->eraseFromParent();
- // Builder.SetInsertPoint(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
- // if (HasShareds) {
- // LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
- // OutlinedFn.getArg(1)->replaceUsesWithIf(
- // Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
- // }
-
while (!ToBeDeleted.empty()) {
ToBeDeleted.top()->eraseFromParent();
ToBeDeleted.pop();
@@ -5583,8 +5660,6 @@ static void emitTargetCall(
OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
/*IsNonContiguous=*/true);
- LLVM_DEBUG(dbgs() << "OMPBuilder.Builder = " << &OMPBuilder.Builder
- << ", Builder = " << &Builder << "\n");
OpenMPIRBuilder::TargetDataRTArgs RTArgs;
OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
!MapInfo.Names.empty());
@@ -5632,7 +5707,10 @@ static void emitTargetCall(
// make task call
// }
//
- if (NewOMPIRBuilderTargetCodegen && RequiresOuterTargetTask) {
+
+ // The presence of certain clauses on the target directive require the explicit
+ // generation of the target task.
+ if (RequiresOuterTargetTask) {
OMPBuilder.emitTargetTask(OutlinedFn, OutlinedFnID,
EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,
AllocaIP, Dependencies, HasNoWait);
@@ -5642,7 +5720,7 @@ static void emitTargetCall(
DeviceID, RTLoc, AllocaIP));
}
}
-OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::newCreateTarget(
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
const LocationDescription &Loc, InsertPointTy AllocaIP,
InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
int32_t NumThreads, SmallVectorImpl<Value *> &Args,
@@ -5650,33 +5728,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::newCreateTarget(
OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
SmallVector<DependData> Dependencies) {
- if (!NewOMPIRBuilderTargetCodegen) {
- LLVM_DEBUG(dbgs() << "Old OpenMPIRBuilder target codegen\n");
- return createTarget(Loc, AllocaIP, CodeGenIP, EntryInfo, NumTeams,
- NumThreads, Args, GenMapInfoCB, CBFunc,
- ArgAccessorFuncCB);
- }
- LLVM_DEBUG(dbgs() << "New OpenMPIRBuilder target codegen\n");
- if (!updateToLocation(Loc))
- return InsertPointTy();
- Builder.restoreIP(CodeGenIP);
- Function *OutlinedFn;
- Constant *OutlinedFnID;
- emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
- OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
- if (!Config.isTargetDevice())
- emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
- NumThreads, Args, GenMapInfoCB, Dependencies);
- return Builder.saveIP();
-}
-OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
- const LocationDescription &Loc, InsertPointTy AllocaIP,
- InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
- int32_t NumThreads, SmallVectorImpl<Value *> &Args,
- GenMapInfoCallbackTy GenMapInfoCB,
- OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
- OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
if (!updateToLocation(Loc))
return InsertPointTy();
@@ -5684,12 +5736,18 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
Function *OutlinedFn;
Constant *OutlinedFnID;
+ // The target region is outlined into its own function. The LLVM IR for
+ // the target region itself is generated using the callbacks CBFunc
+ // and ArgAccessorFuncCB
emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
+
+ // If we are not on the target device, then we need to generate code
+ // to make a remote call (offload) to the previously outlined function
+ // that represents the target region. Do that now.
if (!Config.isTargetDevice())
emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
- NumThreads, Args, GenMapInfoCB);
-
+ NumThreads, Args, GenMapInfoCB, Dependencies);
return Builder.saveIP();
}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 022ea3af7f58a..74f34e227d9f0 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -3097,7 +3097,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
moduleTranslation, dds);
- builder.restoreIP(moduleTranslation.getOpenMPBuilder()->newCreateTarget(
+ builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB,
dds));
>From 141dccc06f933a737b7991add8dc466e831e45e9 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 17:11:49 -0500
Subject: [PATCH 14/26] clang-format fixes
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 55 +++++++++++++----------
1 file changed, 31 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index da2bf360cc8fd..46e68e8e0e6ca 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5422,12 +5422,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
// }
//
// We have to now do the following
- // (i) Make an offloading call to outlined_device_function using the OpenMP RTL
- // See 'kernel_launch_function' in the pseudo code below. This is emitted by
- // emitKernelLaunch
- // (ii) Create a task entry point function that calls kernel_launch_function and
- // is the entry point for the target task. See '@.omp_target_task_proxy_func
- // in the pseudocode below.
+ // (i) Make an offloading call to outlined_device_function using the OpenMP
+ // RTL. See 'kernel_launch_function' in the pseudo code below. This is
+ // emitted by emitKernelLaunch
+ // (ii) Create a task entry point function that calls kernel_launch_function
+ // and is the entry point for the target task. See
+ // '@.omp_target_task_proxy_func in the pseudocode below.
// (iii) Create a task with the task entry point created in (ii)
//
// That is we create the following
@@ -5441,36 +5441,42 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
// %strucArg[0] = %.offload_baseptrs
// %strucArg[1] = %.offload_ptrs
// %strucArg[2] = %.offload_mappers
- // proxy_target_task = @__kmpc_omp_task_alloc(..., @.omp_target_task_proxy_func)
+ // proxy_target_task = @__kmpc_omp_task_alloc(...,
+ // @.omp_target_task_proxy_func)
// memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
- // dependencies_array = alloca [
+ // dependencies_array = ...
// ;; if nowait not present
// call @__kmpc_omp_wait_deps(..., dependencies_array)
// call @__kmpc_omp_task_begin_if0(...)
- // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
- // call @__kmpc_omp_task_complete_if0(...)
+ // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
+ // %proxy_target_task) call @__kmpc_omp_task_complete_if0(...)
// }
//
- // define internal void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task) {
+ // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
+ // ptr %task) {
// %structArg = alloca {ptr, ptr, ptr}
// %shared_data = load (getelementptr %task, 0, 0)
// mempcy(%structArg, %shared_data, sizeof(structArg))
// kernel_launch_function(%thread.id, %structArg)
// }
//
- // We need the proxy function because the signature of the task entry point expected
- // by kmpc_omp_task is always the same and will be different from that of the
- // kernel_launch function.
+ // We need the proxy function because the signature of the task entry point
+ // expected by kmpc_omp_task is always the same and will be different from
+ // that of the kernel_launch function.
//
- // kernel_launch_function is generated by emitKernelLaunch and has the always_inline
- // attribute.
- // void kernel_launch_function(thread_id, structArg) alwaysinline {
+ // kernel_launch_function is generated by emitKernelLaunch and has the
+ // always_inline attribute. void kernel_launch_function(thread_id,
+ // structArg)
+ // alwaysinline {
// %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
// offload_baseptrs = load(getelementptr structArg, 0, 0)
// offload_ptrs = load(getelementptr structArg, 0, 1)
// offload_mappers = load(getelementptr structArg, 0, 2)
- // ; setup kernel_args using offload_baseptrs, offload_ptrs and offload_mappers
- // call i32 @__tgt_target_kernel(..., outlined_device_function, ptr %kernel_args)
+ // ; setup kernel_args using offload_baseptrs, offload_ptrs and
+ // ; offload_mappers
+ // call i32 @__tgt_target_kernel(...,
+ // outlined_device_function,
+ // ptr %kernel_args)
// }
// void outlined_device_function(ptr a, ptr b, ptr c) {
// *a = *b + *c
@@ -5498,9 +5504,10 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
Builder.restoreIP(TargetTaskBodyIP);
// emitKernelLaunch makes the necessary runtime call to offload the kernel.
- // We then outline all that code into a separate function ('kernel_launch_function' in
- // the pseudo code above). This function is then called by the target task proxy
- // function (see '@.omp_target_task_proxy_func' in the pseudo code above)
+ // We then outline all that code into a separate function
+ // ('kernel_launch_function' in the pseudo code above). This function is then
+ // called by the target task proxy function (see
+ // '@.omp_target_task_proxy_func' in the pseudo code above)
// "@.omp_target_task_proxy_func' is generated by emitProxyTaskFunction
Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
EmitTargetCallFallbackCB, Args, DeviceID,
@@ -5708,8 +5715,8 @@ static void emitTargetCall(
// }
//
- // The presence of certain clauses on the target directive require the explicit
- // generation of the target task.
+ // The presence of certain clauses on the target directive require the
+ // explicit generation of the target task.
if (RequiresOuterTargetTask) {
OMPBuilder.emitTargetTask(OutlinedFn, OutlinedFnID,
EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,
>From 6697f1e266a9ea78062237829ed8ec2c7725a36c Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 17:13:29 -0500
Subject: [PATCH 15/26] remove commented out createFakeIntVal
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 36 -----------------------
1 file changed, 36 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 46e68e8e0e6ca..cf6fcdca9294f 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -389,42 +389,6 @@ Value *createFakeIntVal(IRBuilderBase &Builder,
ToBeDeleted.push(UseFakeVal);
return FakeVal;
}
-// // This function creates a fake integer value and a fake use for the integer
-// // value. It returns the fake value created. This is useful in modeling the
-// // extra arguments to the outlined functions.
-// Value *createFakeIntVal(IRBuilder<> &Builder,
-// OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
-// std::stack<Instruction *> &ToBeDeleted,
-// OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
-// const Twine &Name = "", bool AsPtr = true) {
-// Builder.restoreIP(OuterAllocaIP);
-// Instruction *FakeVal;
-// AllocaInst *FakeValAddr =
-// Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
-// ToBeDeleted.push(FakeValAddr);
-
-// if (AsPtr) {
-// FakeVal = FakeValAddr;
-// } else {
-// FakeVal =
-// Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
-// ToBeDeleted.push(FakeVal);
-// }
-
-// // Generate a fake use of this value
-// Builder.restoreIP(InnerAllocaIP);
-// Instruction *UseFakeVal;
-// if (AsPtr) {
-// UseFakeVal =
-// Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
-// } else {
-// UseFakeVal =
-// cast<BinaryOperator>(Builder.CreateAdd(FakeVal,
-// Builder.getInt32(10)));
-// }
-// ToBeDeleted.push(UseFakeVal);
-// return FakeVal;
-// }
//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
>From e75c8536219bd21cd7e653186b8520d2ea87c359 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 17:16:32 -0500
Subject: [PATCH 16/26] more cleanup
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 12 ------------
1 file changed, 12 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index cf6fcdca9294f..f2ab1ad18e694 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5666,18 +5666,6 @@ static void emitTargetCall(
OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
NumTeamsVal, NumThreadsVal,
DynCGGroupMem, HasNoWait);
- // PDB: here you'll have to break the logic down to do the following
- // if (!requiresoutertask) {
- // Builder.restoreIP(OMPBuilder.emitKernelLaunch(
- // Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
- // DeviceID, RTLoc, AllocaIP));
- // else {
- // codegen_callback = codegen callback to create task logic which should be
- // received from openmptollvmirtranslation + emitkernellaunch
- // create_task(codegen_callback)
- // make task call
- // }
- //
// The presence of certain clauses on the target directive require the
// explicit generation of the target task.
>From 799751fa9aa71b0e9e75be60ecbdea6e6aba1200 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 30 May 2024 23:31:54 -0500
Subject: [PATCH 17/26] Fix dependencies when nowait is used on target
construct
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 36 ++++++++++++++++-------
1 file changed, 26 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index f2ab1ad18e694..70dcb5adef08c 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1703,6 +1703,10 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
static Value *
emitDepArray(OpenMPIRBuilder &OMPBuilder,
SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
+ // Early return if we have no dependencies to process
+ if (!Dependencies.size())
+ return nullptr;
+
IRBuilderBase &Builder = OMPBuilder.Builder;
Type *DependInfo = OMPBuilder.DependInfo;
Module &M = OMPBuilder.M;
@@ -5561,16 +5565,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
SharedsSize);
}
- if (Dependencies.size()) {
- Value *DepArray = emitDepArray(*this, Dependencies);
- Function *TaskWaitFn =
- getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
- Builder.CreateCall(
- TaskWaitFn,
- {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
- ConstantInt::get(Builder.getInt32Ty(), 0),
- ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
- }
+
+ Value *DepArray = emitDepArray(*this, Dependencies);
// ---------------------------------------------------------------
// V5.2 13.8 target construct
@@ -5581,6 +5577,15 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
// The above means that the lack of a nowait on the target construct
// translates to '#pragma omp task if(0)'
if (!HasNoWait) {
+ if (DepArray) {
+ Function *TaskWaitFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
+ Builder.CreateCall(
+ TaskWaitFn,
+ {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
+ ConstantInt::get(Builder.getInt32Ty(), 0),
+ ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
+ }
// Included task.
Function *TaskBeginFn =
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
@@ -5594,6 +5599,17 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
CI = Builder.CreateCall(ProxyFn, {ThreadID});
CI->setDebugLoc(StaleCI->getDebugLoc());
Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
+ } else if (DepArray) {
+ // HasNoWait - meaning the task may be deferred. Call
+ // __kmpc_omp_task_with_deps if there are dependencies,
+ // else call __kmpc_omp_task
+ Function *TaskFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
+ Builder.CreateCall(
+ TaskFn,
+ {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
+ DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
+ ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
} else {
// Emit the @__kmpc_omp_task runtime call to spawn the task
Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
>From 77e5753582539cba299ab1410c5b966bba92f63c Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 31 May 2024 10:18:39 -0500
Subject: [PATCH 18/26] Add comments for emitTargetTask and emitDepArray
---
llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 16 ++++++++++++++++
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 6 ++++++
2 files changed, 22 insertions(+)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2ed130c87d40b..d028820f16c91 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1761,11 +1761,27 @@ class OpenMPIRBuilder {
const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
+
+ /// Generate a target-task for the target construct
+ ///
+ /// \param OutlinedFn The outlined device/target kernel function.
+ /// \param OutlinedFnID The outlined function ID.
+ /// \param EmitTargetCallFallbackCB Call back function to generate host
+ /// fallback code.
+ /// \param Args Data structure holding information about the kernel arguments.
+ /// \param DeviceID Identifier for the device via the 'device' clause.
+ /// \param RTLoc Source location identifier
+ /// \param AllocaIP The insertion point to be used for alloca instructions.
+ /// \param Dependencies Vector of DependData objects holding information of
+ /// dependencies as specified by the 'depend' clause.
+ /// \param HasNoWait True if the target construct had 'nowait' on it, false
+ /// otherwise
InsertPointTy emitTargetTask(
Function *OutlinedFn, Value *OutlinedFnID,
EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP,
SmallVector<OpenMPIRBuilder::DependData> &Dependencies, bool HasNoWait);
+
/// Emit the arguments to be passed to the runtime library based on the
/// arrays of base pointers, pointers, sizes, map types, and mappers. If
/// ForEndCall, emit map types to be passed for the end of the region instead
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 70dcb5adef08c..5e084a5acb6f2 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1700,6 +1700,12 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
return;
emitTaskyieldImpl(Loc);
}
+
+// Processes the dependencies in Dependencies and does the following
+// - Allocates space on the stack for an array of DependInfo objects
+// - Populates each DependInfo object with relevant information of
+// the corresponding dependence.
+// - All code is inserted in the entry block of the current function.
static Value *
emitDepArray(OpenMPIRBuilder &OMPBuilder,
SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
>From 073d194e9800cea748b186b27950ff9862eaf5f3 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 31 May 2024 10:28:56 -0500
Subject: [PATCH 19/26] Fix comment in target-depend.f90
---
offload/test/offloading/fortran/target-depend.f90 | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
index db58f2db6bbe9..1c5ab1efcfbdd 100644
--- a/offload/test/offloading/fortran/target-depend.f90
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -1,5 +1,5 @@
-! Offloading test checking interaction of fixed size
-! arrays with enter, exit and target
+! Offloading test checking the use of the depend clause on
+! the target construct
! REQUIRES: flang, amdgcn-amd-amdhsa
! UNSUPPORTED: nvptx64-nvidia-cuda
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
>From 29566b122a154739e2591ec6c6ac3ea487e63ac1 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Tue, 11 Jun 2024 11:53:36 -0500
Subject: [PATCH 20/26] Incorporate changes for review comments
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 190 ++++++++++--------
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 3 +-
.../test/offloading/fortran/target-depend.f90 | 22 +-
3 files changed, 121 insertions(+), 94 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 5e084a5acb6f2..7a62f00952640 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -792,9 +792,6 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
if (!OffloadInfoManager.empty())
createOffloadEntriesAndInfoMetadata(ErrorReportFn);
-
- LLVM_DEBUG(dbgs() << "Module after OMPIRBuilder::finalize\n");
- LLVM_DEBUG(dbgs() << M << "\n");
}
OpenMPIRBuilder::~OpenMPIRBuilder() {
@@ -1707,55 +1704,65 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
// the corresponding dependence.
// - All code is inserted in the entry block of the current function.
static Value *
-emitDepArray(OpenMPIRBuilder &OMPBuilder,
- SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
+emitTaskDependencies(OpenMPIRBuilder &OMPBuilder,
+ SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
// Early return if we have no dependencies to process
- if (!Dependencies.size())
+ if (Dependencies.empty())
return nullptr;
+ // Given a vector of DependData objects, in this function we create an
+ // array on the stack that holds kmp_dep_info objects corresponding
+ // to each dependency. This is then passed to the OpenMP runtime.
+ // For example, if there are 'n' dependencies then the following pseudo
+ // code is generated. Assume the first dependence is on a variable 'a'
+ //
+ // \code{c}
+ // DepArray = alloc(n x sizeof(kmp_depend_info));
+ // idx = 0;
+ // DepArray[idx].base_addr = ptrtoint(&a);
+ // DepArray[idx].len = 8;
+ // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
+ // ++idx;
+ // DepArray[idx].base_addr = ...;
+ // \endcode
+
IRBuilderBase &Builder = OMPBuilder.Builder;
Type *DependInfo = OMPBuilder.DependInfo;
Module &M = OMPBuilder.M;
Value *DepArray = nullptr;
- if (Dependencies.size()) {
- OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
- Builder.SetInsertPoint(
- &OldIP.getBlock()->getParent()->getEntryBlock().back());
-
- Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
- DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
-
- unsigned P = 0;
- for (const OpenMPIRBuilder::DependData &Dep : Dependencies) {
- Value *Base =
- Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
- // Store the pointer to the variable
- Value *Addr = Builder.CreateStructGEP(
- DependInfo, Base,
- static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
- Value *DepValPtr =
- Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
- Builder.CreateStore(DepValPtr, Addr);
- // Store the size of the variable
- Value *Size = Builder.CreateStructGEP(
- DependInfo, Base,
- static_cast<unsigned int>(RTLDependInfoFields::Len));
- Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
- Dep.DepValueType)),
- Size);
- // Store the dependency kind
- Value *Flags = Builder.CreateStructGEP(
- DependInfo, Base,
- static_cast<unsigned int>(RTLDependInfoFields::Flags));
- Builder.CreateStore(
- ConstantInt::get(Builder.getInt8Ty(),
- static_cast<unsigned int>(Dep.DepKind)),
- Flags);
- ++P;
- }
- Builder.restoreIP(OldIP);
- }
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+ Builder.SetInsertPoint(
+ OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
+
+ Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
+ DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
+
+ for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
+ Value *Base =
+ Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
+ // Store the pointer to the variable
+ Value *Addr = Builder.CreateStructGEP(
+ DependInfo, Base,
+ static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
+ Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
+ Builder.CreateStore(DepValPtr, Addr);
+ // Store the size of the variable
+ Value *Size = Builder.CreateStructGEP(
+ DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
+ Builder.CreateStore(
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
+ Size);
+ // Store the dependency kind
+ Value *Flags = Builder.CreateStructGEP(
+ DependInfo, Base,
+ static_cast<unsigned int>(RTLDependInfoFields::Flags));
+ Builder.CreateStore(
+ ConstantInt::get(Builder.getInt8Ty(),
+ static_cast<unsigned int>(Dep.DepKind)),
+ Flags);
+ }
+ Builder.restoreIP(OldIP);
return DepArray;
}
@@ -5273,19 +5280,37 @@ static Function *createOutlinedFunction(
return Func;
}
-// Create an entry point for a target task with the following.
-// It'll have the following signature
-// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
-// This function is called from emitTargetTask once the
-// code to launch the target kernel has been outlined already.
-static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
- IRBuilderBase &Builder,
- CallInst *StaleCI) {
+/// Create an entry point for a target task.
+/// It'll have the following signature
+/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
+/// This function is called from emitTargetTask once the
+/// code to launch the target kernel has been outlined already.
+static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
+ IRBuilderBase &Builder,
+ CallInst *StaleCI) {
Module &M = OMPBuilder.M;
- // CalledFunction is the target launch function, i.e.
+ // KernelLaunchFunction is the target launch function, i.e.
// the function that sets up kernel arguments and calls
// __tgt_target_kernel to launch the kernel on the device.
- Function *CalledFunction = StaleCI->getCalledFunction();
+ //
+ Function *KernelLaunchFunction = StaleCI->getCalledFunction();
+
+ // StaleCI is the CallInst which is the call to the outlined
+ // target kernel launch function. If there are values that the
+ // outlined function uses then these are aggregated into a structure
+ // which is passed as the second argument. If not, then there's
+ // only one argument, the threadID. So, StaleCI can be
+ //
+ // %structArg = alloca { ptr, ptr }, align 8
+ // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
+ // store ptr %20, ptr %gep_, align 8
+ // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
+ // store ptr %21, ptr %gep_8, align 8
+ // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
+ //
+ // OR
+ //
+ // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
StaleCI->getIterator());
LLVMContext &Ctx = StaleCI->getParent()->getContext();
@@ -5298,6 +5323,8 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
".omp_target_task_proxy_func",
Builder.GetInsertBlock()->getModule());
+ ProxyFn->getArg(0)->setName("thread.id");
+ ProxyFn->getArg(1)->setName("task");
BasicBlock *EntryBB =
BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
@@ -5311,20 +5338,17 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
"StaleCI with shareds should have exactly two arguments.");
if (HasShareds) {
- AllocaInst *ArgStructAlloca =
- dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+ auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
assert(ArgStructAlloca &&
"Unable to find the alloca instruction corresponding to arguments "
"for extracted function");
- StructType *ArgStructType =
+ auto *ArgStructType =
dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
- LLVM_DEBUG(dbgs() << "ArgStructType = " << *ArgStructType << "\n");
AllocaInst *NewArgStructAlloca =
Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
Value *TaskT = ProxyFn->getArg(1);
Value *ThreadId = ProxyFn->getArg(0);
- LLVM_DEBUG(dbgs() << "TaskT = " << *TaskT << "\n");
Value *SharedsSize =
Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
@@ -5332,16 +5356,12 @@ static Function *emitProxyTaskFunction(OpenMPIRBuilder &OMPBuilder,
LoadInst *LoadShared =
Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
- // TODO: Are these alignment values correct?
Builder.CreateMemCpy(
- NewArgStructAlloca,
- NewArgStructAlloca->getPointerAlignment(M.getDataLayout()), LoadShared,
+ NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
- Builder.CreateCall(CalledFunction, {ThreadId, NewArgStructAlloca});
+ Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
}
- ProxyFn->getArg(0)->setName("thread.id");
- ProxyFn->getArg(1)->setName("task");
Builder.CreateRetVoid();
return ProxyFn;
}
@@ -5439,9 +5459,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
// that of the kernel_launch function.
//
// kernel_launch_function is generated by emitKernelLaunch and has the
- // always_inline attribute. void kernel_launch_function(thread_id,
- // structArg)
- // alwaysinline {
+ // always_inline attribute.
+ // void kernel_launch_function(thread_id,
+ // structArg) alwaysinline {
// %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
// offload_baseptrs = load(getelementptr structArg, 0, 0)
// offload_ptrs = load(getelementptr structArg, 0, 1)
@@ -5482,7 +5502,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
// ('kernel_launch_function' in the pseudo code above). This function is then
// called by the target task proxy function (see
// '@.omp_target_task_proxy_func' in the pseudo code above)
- // "@.omp_target_task_proxy_func' is generated by emitProxyTaskFunction
+ // '@.omp_target_task_proxy_func' is generated by emitTargetTaskProxyFunction
Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
EmitTargetCallFallbackCB, Args, DeviceID,
RTLoc, TargetTaskAllocaIP));
@@ -5496,20 +5516,14 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
bool HasShareds = StaleCI->arg_size() > 1;
- LLVM_DEBUG(dbgs() << "StaleCI in PostOutlineCB in emitTargetTask = "
- << *StaleCI << "\n");
- LLVM_DEBUG(dbgs() << "Module in PostOutlineCB in emitTargetTask = "
- << *(StaleCI->getParent()->getParent()->getParent())
- << "\n");
-
- Function *ProxyFn = emitProxyTaskFunction(*this, Builder, StaleCI);
+ Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
<< "\n");
Builder.SetInsertPoint(StaleCI);
- // Gather the arguments for emitting the runtime call for
+ // Gather the arguments for emitting the runtime call.
uint32_t SrcLocStrSize;
Constant *SrcLocStr =
getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
@@ -5527,20 +5541,19 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
// Tasksize refers to the size in bytes of kmp_task_t data structure
// including private vars accessed in task.
// TODO: add kmp_task_t_with_privates (privates)
- Value *TaskSize = Builder.getInt64(
- divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
+ Value *TaskSize =
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
// Argument - `sizeof_shareds` (SharedsSize)
// SharedsSize refers to the shareds array size in the kmp_task_t data
// structure.
Value *SharedsSize = Builder.getInt64(0);
if (HasShareds) {
- AllocaInst *ArgStructAlloca =
- dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+ auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
assert(ArgStructAlloca &&
"Unable to find the alloca instruction corresponding to arguments "
"for extracted function");
- StructType *ArgStructType =
+ auto *ArgStructType =
dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
assert(ArgStructType && "Unable to find struct type corresponding to "
"arguments for extracted function");
@@ -5572,7 +5585,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
SharedsSize);
}
- Value *DepArray = emitDepArray(*this, Dependencies);
+ Value *DepArray = emitTaskDependencies(*this, Dependencies);
// ---------------------------------------------------------------
// V5.2 13.8 target construct
@@ -5588,8 +5601,11 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
Builder.CreateCall(
TaskWaitFn,
- {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
- ConstantInt::get(Builder.getInt32Ty(), 0),
+ {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
+ /*ndeps=*/Builder.getInt32(Dependencies.size()),
+ /*dep_list=*/DepArray,
+ /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
+ /*noalias_dep_list=*/
ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
}
// Included task.
@@ -5660,8 +5676,6 @@ static void emitTargetCall(
// emitKernelLaunch
auto &&EmitTargetCallFallbackCB =
[&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
- LLVM_DEBUG(dbgs() << "EmitTargetCallFallbackCB::Builder = " << &Builder
- << "\n");
Builder.restoreIP(IP);
Builder.CreateCall(OutlinedFn, Args);
return Builder.saveIP();
@@ -5999,8 +6013,6 @@ void OpenMPIRBuilder::emitOffloadingArrays(
return;
Builder.restoreIP(AllocaIP);
- LLVM_DEBUG(dbgs() << "Basicblock before emitOffloadingArrays\n"
- << *(Builder.GetInsertBlock()) << "\n");
// Detect if we have any capture size requiring runtime evaluation of the
// size so that a constant array could be eventually used.
ArrayType *PointerArrayType =
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 74f34e227d9f0..e324730c39a17 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -681,10 +681,11 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
ompLoc, bodyCB, numTeamsLower, numTeamsUpper, threadLimit, ifExpr));
return bodyGenStatus;
}
+
static void
buildDependData(std::optional<ArrayAttr> depends, OperandRange dependVars,
LLVM::ModuleTranslation &moduleTranslation,
- SmallVector<llvm::OpenMPIRBuilder::DependData> &dds) {
+ SmallVectorImpl<llvm::OpenMPIRBuilder::DependData> &dds) {
for (auto dep : llvm::zip(dependVars, depends->getValue())) {
llvm::omp::RTLDependenceKindTy type;
switch (
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
index 1c5ab1efcfbdd..cd256fa3f2164 100644
--- a/offload/test/offloading/fortran/target-depend.f90
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -13,25 +13,39 @@ program main
integer :: a = 0
call foo(5, a)
print*, "======= FORTRAN Test passed! ======="
- print*, "foo(5) returned ", a, ", expected 6\n"
+ print*, "foo(5) returned ", a, ", expected 8\n"
! stop 0
end program main
subroutine foo(N, r)
integer, intent(in) :: N
integer, intent(out) :: r
- integer :: z
-
+ integer :: z, i
z = 1
+ ! Spawn 3 threads
+ !$omp parallel num_threads(3)
+
+ ! Each thread redundantly updates z to N
+ ! i.e. 5
!$omp task depend(out: z) shared(z)
+ do while (i < 32766)
+ ! dumb loop to slow down the update of
+ ! z
+ i = i + 1
+ end do
z = N
!$omp end task
+ ! z is 5 now. Each thread then offloads
+ ! increment of z by 1. So, z is incremented
+ ! three times.
!$omp target map(tofrom: z) depend(in: z)
z = z + 1
!$omp end target
+ !$omp end parallel
+ ! z is 8.
r = z
end subroutine foo
!CHECK: ======= FORTRAN Test passed! =======
-!CHECK: foo(5) returned 6 , expected 6
+!CHECK: foo(5) returned 8 , expected 8
>From a12ffb476c8f28a975d3e565df96bdb14f5ae961 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 21 Jun 2024 00:02:02 -0500
Subject: [PATCH 21/26] Update testcase to make it more reliable by correcting
the use of the depend clause (sibling tasks)
---
.../test/offloading/fortran/target-depend.f90 | 92 +++++++++++++------
1 file changed, 62 insertions(+), 30 deletions(-)
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
index cd256fa3f2164..e7729a8c31e6b 100644
--- a/offload/test/offloading/fortran/target-depend.f90
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -10,42 +10,74 @@
! RUN: %libomptarget-compile-fortran-run-and-check-generic
program main
+ implicit none
integer :: a = 0
+ INTERFACE
+ FUNCTION omp_get_device_num() BIND(C)
+ USE, INTRINSIC :: iso_c_binding, ONLY: C_INT
+ integer :: omp_get_device_num
+ END FUNCTION omp_get_device_num
+ END INTERFACE
+
call foo(5, a)
print*, "======= FORTRAN Test passed! ======="
- print*, "foo(5) returned ", a, ", expected 8\n"
- ! stop 0
-end program main
-subroutine foo(N, r)
- integer, intent(in) :: N
- integer, intent(out) :: r
- integer :: z, i
- z = 1
- ! Spawn 3 threads
- !$omp parallel num_threads(3)
+ print*, "foo(5) returned ", a, ", expected 6\n"
- ! Each thread redundantly updates z to N
- ! i.e. 5
- !$omp task depend(out: z) shared(z)
- do while (i < 32766)
- ! dumb loop to slow down the update of
- ! z
- i = i + 1
- end do
- z = N
- !$omp end task
+ ! stop 0
+ contains
+ subroutine foo(N, r)
+ integer, intent(in) :: N
+ integer, intent(out) :: r
+ integer :: z, i, j, k, accumulator
+ z = 1
+ accumulator = 0
+ ! Spawn 3 threads
+ !$omp parallel num_threads(3)
- ! z is 5 now. Each thread then offloads
- ! increment of z by 1. So, z is incremented
- ! three times.
- !$omp target map(tofrom: z) depend(in: z)
- z = z + 1
- !$omp end target
- !$omp end parallel
+ ! A single thread will then create two tasks
+ ! One is the 'producer' and potentially slower
+ ! task that updates 'z' to 'N'. The second is an
+ ! offloaded target task that increments 'z'.
+ ! If the depend clauses work properly, the
+ ! target task should wait for the 'producer'
+ ! task to complete before incrementing z
+ ! We use !$omp single here because only
+ ! the depend clause establishes dependencies
+ ! between sibling tasks only. This is the easiest
+ ! way of creating two sibling tasks.
+ !$omp single
+ !$omp task depend(out: z) shared(z)
+ do while (k < 32000)
+ do while (j < 32766)
+ do while (i < 32766)
+ ! dumb loop nest to slow down the update of
+ ! z
+ i = i + 1
+ ! Adding a function call slows down the producer
+ ! to the point that removing the depend clause
+ ! from the target construct below frequently
+ ! results in the wrong answer.
+ accumulator = accumulator + omp_get_device_num()
+ end do
+ j = j +1
+ end do
+ k = k + 1
+ end do
+ z = N
+ !$omp end task
- ! z is 8.
- r = z
+ ! z is 5 now. Increment z to 6.
+ !$omp target map(tofrom: z) depend(in:z)
+ z = z + 1
+ !$omp end target
+ !$omp end single
+ !$omp end parallel
+ ! Use 'accumulator' so it is not optimized away
+ ! by the compiler.
+ print *, accumulator
+ r = z
end subroutine foo
!CHECK: ======= FORTRAN Test passed! =======
-!CHECK: foo(5) returned 8 , expected 8
+!CHECK: foo(5) returned 6 , expected 6
+end program main
>From ac34fd18fc503034de96a9eb42b74ce580d61ca4 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Mon, 1 Jul 2024 17:10:06 -0500
Subject: [PATCH 22/26] Address more review comments
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 6 +-
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 59 +++++++++++--------
.../test/offloading/fortran/target-depend.f90 | 42 +++++--------
3 files changed, 52 insertions(+), 55 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 7a62f00952640..6de2318c23659 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5706,9 +5706,9 @@ static void emitTargetCall(
// The presence of certain clauses on the target directive require the
// explicit generation of the target task.
if (RequiresOuterTargetTask) {
- OMPBuilder.emitTargetTask(OutlinedFn, OutlinedFnID,
- EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc,
- AllocaIP, Dependencies, HasNoWait);
+ Builder.restoreIP(OMPBuilder.emitTargetTask(
+ OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, DeviceID,
+ RTLoc, AllocaIP, Dependencies, HasNoWait));
} else {
Builder.restoreIP(OMPBuilder.emitKernelLaunch(
Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index e324730c39a17..6b6679c8cfecd 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -683,27 +683,38 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
}
static void
-buildDependData(std::optional<ArrayAttr> depends, OperandRange dependVars,
- LLVM::ModuleTranslation &moduleTranslation,
+buildDependData(Operation *op, LLVM::ModuleTranslation &moduleTranslation,
SmallVectorImpl<llvm::OpenMPIRBuilder::DependData> &dds) {
- for (auto dep : llvm::zip(dependVars, depends->getValue())) {
- llvm::omp::RTLDependenceKindTy type;
- switch (
- cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
- case mlir::omp::ClauseTaskDepend::taskdependin:
- type = llvm::omp::RTLDependenceKindTy::DepIn;
- break;
- // The OpenMP runtime requires that the codegen for 'depend' clause for
- // 'out' dependency kind must be the same as codegen for 'depend' clause
- // with 'inout' dependency.
- case mlir::omp::ClauseTaskDepend::taskdependout:
- case mlir::omp::ClauseTaskDepend::taskdependinout:
- type = llvm::omp::RTLDependenceKindTy::DepInOut;
- break;
- };
- llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
- llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
- dds.emplace_back(dd);
+ auto processDepends = [&](std::optional<ArrayAttr> depends,
+ OperandRange dependVars) {
+ if (dependVars.empty())
+ return;
+ for (auto dep : llvm::zip(dependVars, depends->getValue())) {
+ llvm::omp::RTLDependenceKindTy type;
+ switch (
+ cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
+ case mlir::omp::ClauseTaskDepend::taskdependin:
+ type = llvm::omp::RTLDependenceKindTy::DepIn;
+ break;
+ // The OpenMP runtime requires that the codegen for 'depend' clause for
+ // 'out' dependency kind must be the same as codegen for 'depend' clause
+ // with 'inout' dependency.
+ case mlir::omp::ClauseTaskDepend::taskdependout:
+ case mlir::omp::ClauseTaskDepend::taskdependinout:
+ type = llvm::omp::RTLDependenceKindTy::DepInOut;
+ break;
+ };
+ llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
+ llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
+ dds.emplace_back(dd);
+ }
+ };
+
+ if (auto taskOp = dyn_cast<omp::TaskOp>(op)) {
+ processDepends(taskOp.getDepends(), taskOp.getDependVars());
+ }
+ if (auto targetOp = dyn_cast<omp::TargetOp>(op)) {
+ processDepends(targetOp.getDepends(), targetOp.getDependVars());
}
}
/// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
@@ -729,9 +740,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
};
SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
- if (!taskOp.getDependVars().empty() && taskOp.getDepends())
- buildDependData(taskOp.getDepends(), taskOp.getDependVars(),
- moduleTranslation, dds);
+ buildDependData(taskOp.getOperation(), moduleTranslation, dds);
llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
findAllocaInsertPoint(builder, moduleTranslation);
@@ -3094,9 +3103,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
kernelInput.push_back(mapData.OriginalValue[i]);
}
SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
- if (!targetOp.getDependVars().empty() && targetOp.getDepends())
- buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
- moduleTranslation, dds);
+ buildDependData(targetOp.getOperation(), moduleTranslation, dds);
builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
index e7729a8c31e6b..81e1770465a42 100644
--- a/offload/test/offloading/fortran/target-depend.f90
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -34,34 +34,25 @@ subroutine foo(N, r)
! Spawn 3 threads
!$omp parallel num_threads(3)
- ! A single thread will then create two tasks
- ! One is the 'producer' and potentially slower
- ! task that updates 'z' to 'N'. The second is an
- ! offloaded target task that increments 'z'.
- ! If the depend clauses work properly, the
- ! target task should wait for the 'producer'
- ! task to complete before incrementing z
- ! We use !$omp single here because only
- ! the depend clause establishes dependencies
- ! between sibling tasks only. This is the easiest
- ! way of creating two sibling tasks.
+ ! A single thread will then create two tasks - one is the 'producer' and
+ ! potentially slower task that updates 'z' to 'N'. The second is an
+ ! offloaded target task that increments 'z'. If the depend clauses work
+ ! properly, the target task should wait for the 'producer' task to
+ ! complete before incrementing 'z'. We use 'omp single' here because the
+ ! depend clause establishes dependencies between sibling tasks only.
+ ! This is the easiest way of creating two sibling tasks.
!$omp single
!$omp task depend(out: z) shared(z)
- do while (k < 32000)
- do while (j < 32766)
- do while (i < 32766)
- ! dumb loop nest to slow down the update of
- ! z
- i = i + 1
- ! Adding a function call slows down the producer
- ! to the point that removing the depend clause
- ! from the target construct below frequently
- ! results in the wrong answer.
+ do k=1, 32766
+ do j=1, 32766
+ do i = 1, 32766
+ ! dumb loop nest to slow down the update of 'z'.
+ ! Adding a function call slows down the producer to the point
+ ! that removing the depend clause from the target construct below
+ ! frequently results in the wrong answer.
accumulator = accumulator + omp_get_device_num()
end do
- j = j +1
end do
- k = k + 1
end do
z = N
!$omp end task
@@ -72,11 +63,10 @@ subroutine foo(N, r)
!$omp end target
!$omp end single
!$omp end parallel
- ! Use 'accumulator' so it is not optimized away
- ! by the compiler.
+ ! Use 'accumulator' so it is not optimized away by the compiler.
print *, accumulator
r = z
-end subroutine foo
+ end subroutine foo
!CHECK: ======= FORTRAN Test passed! =======
!CHECK: foo(5) returned 6 , expected 6
>From c0e2ceb0d88e564ec50fc96df6869888e4d133d9 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Tue, 2 Jul 2024 08:51:46 -0500
Subject: [PATCH 23/26] Incorporate one more review comment
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 6de2318c23659..b045f6c6961a3 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5481,10 +5481,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
BasicBlock *TargetTaskAllocaBB =
splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
- InsertPointTy TargetTaskAllocaIP =
- InsertPointTy(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
- InsertPointTy TargetTaskBodyIP =
- InsertPointTy(TargetTaskBodyBB, TargetTaskBodyBB->begin());
+ InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
+ TargetTaskAllocaBB->begin());
+ InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
OutlineInfo OI;
OI.EntryBB = TargetTaskAllocaBB;
>From 9b49c09084aee2fed711e30d4a1f0410c69e0095 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Tue, 9 Jul 2024 01:09:06 -0500
Subject: [PATCH 24/26] Incorporate changes from review comments
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 59 ++++++++-----------
1 file changed, 26 insertions(+), 33 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6b6679c8cfecd..391bbacc2f6cd 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -683,38 +683,29 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
}
static void
-buildDependData(Operation *op, LLVM::ModuleTranslation &moduleTranslation,
+buildDependData(std::optional<ArrayAttr> depends, OperandRange dependVars,
+ LLVM::ModuleTranslation &moduleTranslation,
SmallVectorImpl<llvm::OpenMPIRBuilder::DependData> &dds) {
- auto processDepends = [&](std::optional<ArrayAttr> depends,
- OperandRange dependVars) {
- if (dependVars.empty())
- return;
- for (auto dep : llvm::zip(dependVars, depends->getValue())) {
- llvm::omp::RTLDependenceKindTy type;
- switch (
- cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
- case mlir::omp::ClauseTaskDepend::taskdependin:
- type = llvm::omp::RTLDependenceKindTy::DepIn;
- break;
- // The OpenMP runtime requires that the codegen for 'depend' clause for
- // 'out' dependency kind must be the same as codegen for 'depend' clause
- // with 'inout' dependency.
- case mlir::omp::ClauseTaskDepend::taskdependout:
- case mlir::omp::ClauseTaskDepend::taskdependinout:
- type = llvm::omp::RTLDependenceKindTy::DepInOut;
- break;
- };
- llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
- llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
- dds.emplace_back(dd);
- }
- };
-
- if (auto taskOp = dyn_cast<omp::TaskOp>(op)) {
- processDepends(taskOp.getDepends(), taskOp.getDependVars());
- }
- if (auto targetOp = dyn_cast<omp::TargetOp>(op)) {
- processDepends(targetOp.getDepends(), targetOp.getDependVars());
+ if (dependVars.empty())
+ return;
+ for (auto dep : llvm::zip(dependVars, depends->getValue())) {
+ llvm::omp::RTLDependenceKindTy type;
+ switch (
+ cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
+ case mlir::omp::ClauseTaskDepend::taskdependin:
+ type = llvm::omp::RTLDependenceKindTy::DepIn;
+ break;
+ // The OpenMP runtime requires that the codegen for 'depend' clause for
+ // 'out' dependency kind must be the same as codegen for 'depend' clause
+ // with 'inout' dependency.
+ case mlir::omp::ClauseTaskDepend::taskdependout:
+ case mlir::omp::ClauseTaskDepend::taskdependinout:
+ type = llvm::omp::RTLDependenceKindTy::DepInOut;
+ break;
+ };
+ llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
+ llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
+ dds.emplace_back(dd);
}
}
/// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
@@ -740,7 +731,8 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
};
SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
- buildDependData(taskOp.getOperation(), moduleTranslation, dds);
+ buildDependData(taskOp.getDepends(), taskOp.getDependVars(),
+ moduleTranslation, dds);
llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
findAllocaInsertPoint(builder, moduleTranslation);
@@ -3103,7 +3095,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
kernelInput.push_back(mapData.OriginalValue[i]);
}
SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
- buildDependData(targetOp.getOperation(), moduleTranslation, dds);
+ buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
+ moduleTranslation, dds);
builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
>From cda5bc6c8f3296b768c1bfc3e364b2118c1e6260 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 19 Jul 2024 00:17:40 -0500
Subject: [PATCH 25/26] Simplify test to make it faster yet effective
---
.../test/offloading/fortran/target-depend.f90 | 18 +++++++-----------
1 file changed, 7 insertions(+), 11 deletions(-)
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
index 81e1770465a42..928eb671c9706 100644
--- a/offload/test/offloading/fortran/target-depend.f90
+++ b/offload/test/offloading/fortran/target-depend.f90
@@ -28,7 +28,7 @@ END FUNCTION omp_get_device_num
subroutine foo(N, r)
integer, intent(in) :: N
integer, intent(out) :: r
- integer :: z, i, j, k, accumulator
+ integer :: z, i, accumulator
z = 1
accumulator = 0
! Spawn 3 threads
@@ -43,16 +43,12 @@ subroutine foo(N, r)
! This is the easiest way of creating two sibling tasks.
!$omp single
!$omp task depend(out: z) shared(z)
- do k=1, 32766
- do j=1, 32766
- do i = 1, 32766
- ! dumb loop nest to slow down the update of 'z'.
- ! Adding a function call slows down the producer to the point
- ! that removing the depend clause from the target construct below
- ! frequently results in the wrong answer.
- accumulator = accumulator + omp_get_device_num()
- end do
- end do
+ do i=1, 32766
+ ! dumb loop to slow down the update of 'z'.
+ ! Adding a function call slows down the producer to the point
+ ! that removing the depend clause from the target construct below
+ ! frequently results in the wrong answer.
+ accumulator = accumulator + omp_get_device_num()
end do
z = N
!$omp end task
>From a314a84964ae243cb4fee98a9adeacdfa9edf71e Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Fri, 19 Jul 2024 11:41:11 -0500
Subject: [PATCH 26/26] Use SmallVector instead of std::stack in
CreateFakeIntVal
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 35 ++++++++++-------------
1 file changed, 15 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index b045f6c6961a3..ac053aca95403 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -359,21 +359,21 @@ BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
// extra arguments to the outlined functions.
Value *createFakeIntVal(IRBuilderBase &Builder,
OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
- std::stack<Instruction *> &ToBeDeleted,
+ llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
const Twine &Name = "", bool AsPtr = true) {
Builder.restoreIP(OuterAllocaIP);
Instruction *FakeVal;
AllocaInst *FakeValAddr =
Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
- ToBeDeleted.push(FakeValAddr);
+ ToBeDeleted.push_back(FakeValAddr);
if (AsPtr) {
FakeVal = FakeValAddr;
} else {
FakeVal =
Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
- ToBeDeleted.push(FakeVal);
+ ToBeDeleted.push_back(FakeVal);
}
// Generate a fake use of this value
@@ -386,7 +386,7 @@ Value *createFakeIntVal(IRBuilderBase &Builder,
UseFakeVal =
cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
}
- ToBeDeleted.push(UseFakeVal);
+ ToBeDeleted.push_back(UseFakeVal);
return FakeVal;
}
@@ -1810,7 +1810,7 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
OI.ExitBB = TaskExitBB;
// Add the thread ID argument.
- std::stack<Instruction *> ToBeDeleted;
+ SmallVector<Instruction *, 4> ToBeDeleted;
OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
@@ -2007,10 +2007,8 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
}
- while (!ToBeDeleted.empty()) {
- ToBeDeleted.top()->eraseFromParent();
- ToBeDeleted.pop();
- }
+ llvm::for_each(llvm::reverse(ToBeDeleted),
+ [](Instruction *I) { I->eraseFromParent(); });
};
addOutlineInfo(std::move(OI));
@@ -5490,7 +5488,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
OI.OuterAllocaBB = AllocaIP.getBlock();
// Add the thread ID argument.
- std::stack<Instruction *> ToBeDeleted;
+ SmallVector<Instruction *, 4> ToBeDeleted;
OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
@@ -5638,10 +5636,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
}
StaleCI->eraseFromParent();
- while (!ToBeDeleted.empty()) {
- ToBeDeleted.top()->eraseFromParent();
- ToBeDeleted.pop();
- }
+ llvm::for_each(llvm::reverse(ToBeDeleted),
+ [](Instruction *I) { I->eraseFromParent(); });
};
addOutlineInfo(std::move(OI));
@@ -6862,7 +6858,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
OI.OuterAllocaBB = &OuterAllocaBB;
// Insert fake values for global tid and bound tid.
- std::stack<Instruction *> ToBeDeleted;
+ SmallVector<Instruction *, 8> ToBeDeleted;
InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
@@ -6877,7 +6873,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
assert(OutlinedFn.getNumUses() == 1 &&
"there must be a single user for the outlined function");
CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
- ToBeDeleted.push(StaleCI);
+ ToBeDeleted.push_back(StaleCI);
assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
"Outlined function must have two or three arguments only");
@@ -6901,10 +6897,9 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
Args);
- while (!ToBeDeleted.empty()) {
- ToBeDeleted.top()->eraseFromParent();
- ToBeDeleted.pop();
- }
+ llvm::for_each(llvm::reverse(ToBeDeleted),
+ [](Instruction *I) { I->eraseFromParent(); });
+
};
if (!Config.isTargetDevice())
More information about the Mlir-commits
mailing list