[clang] [llvm] OpenMP offload 'simd' directive (PR #91261)
Eric Wright via cfe-commits
cfe-commits at lists.llvm.org
Fri Jun 14 08:44:51 PDT 2024
https://github.com/efwright updated https://github.com/llvm/llvm-project/pull/91261
>From 61146e27d804e0fff3711c3defb7064384ccc332 Mon Sep 17 00:00:00 2001
From: Eric Francis Wright <wright117 at rzansel61.coral.llnl.gov>
Date: Mon, 6 May 2024 12:20:44 -0700
Subject: [PATCH 1/2] OpenMP offload 'simd' directive
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 2 +
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 239 +-
clang/lib/CodeGen/CGStmtOpenMP.cpp | 187 +-
clang/lib/CodeGen/CodeGenFunction.cpp | 2 +-
clang/lib/Parse/ParseOpenMP.cpp | 6 +-
.../target_teams_generic_loop_codegen.cpp | 12 +-
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 639 ++++-
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 12 +
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 2121 ++++++++++++++++-
llvm/lib/Transforms/Utils/CodeExtractor.cpp | 13 +-
offload/DeviceRTL/include/Interface.h | 11 +
offload/DeviceRTL/include/Mapping.h | 7 +
offload/DeviceRTL/src/Kernel.cpp | 4 +-
offload/DeviceRTL/src/Mapping.cpp | 34 +
offload/DeviceRTL/src/Parallelism.cpp | 22 +-
offload/DeviceRTL/src/Reduction.cpp | 48 +
offload/DeviceRTL/src/State.cpp | 7 +-
offload/DeviceRTL/src/Synchronization.cpp | 4 +
offload/DeviceRTL/src/Workshare.cpp | 44 +
19 files changed, 3133 insertions(+), 281 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index e39c7c58d2780..b515f7761ebc7 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1033,6 +1033,7 @@ static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
: CGM(CGM), OMPBuilder(CGM.getModule()) {
+
KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8);
llvm::OpenMPIRBuilderConfig Config(
CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
@@ -1054,6 +1055,7 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
}
void CGOpenMPRuntime::clear() {
+
InternalVars.clear();
// Clean non-target variable declarations possibly used only in debug info.
for (const auto &Data : EmittedNonTargetVariables) {
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 87496c8e488c6..bef637907b0ed 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -262,6 +262,7 @@ class CheckVarsEscapingDeclContext final
bool IsCombinedParallelRegion) {
if (!S)
return;
+
for (const CapturedStmt::Capture &C : S->captures()) {
if (C.capturesVariable() && !C.capturesVariableByCopy()) {
const ValueDecl *VD = C.getCapturedVar();
@@ -336,13 +337,15 @@ class CheckVarsEscapingDeclContext final
return;
if (!D->hasAssociatedStmt())
return;
+
if (const auto *S =
dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
// Do not analyze directives that do not actually require capturing,
// like `omp for` or `omp simd` directives.
llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
- if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
+ if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown &&
+ D->getDirectiveKind() != OMPD_simd) {
VisitStmt(S->getCapturedStmt());
return;
}
@@ -502,23 +505,23 @@ class CheckVarsEscapingDeclContext final
} // anonymous namespace
/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
+ llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
+ auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+ return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}
-
+
/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
+ llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
@@ -2812,9 +2815,11 @@ void CGOpenMPRuntimeGPU::emitReduction(
return;
bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
-#ifndef NDEBUG
+ bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
-#endif
+ bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
+
+ ASTContext &C = CGM.getContext();
if (Options.SimpleReduction) {
assert(!TeamsReduction && !ParallelReduction &&
@@ -2824,155 +2829,78 @@ void CGOpenMPRuntimeGPU::emitReduction(
return;
}
- assert((TeamsReduction || ParallelReduction) &&
- "Invalid reduction selection in emitReduction.");
-
- llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
- llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
- int Cnt = 0;
- for (const Expr *DRE : Privates) {
- PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
- ++Cnt;
- }
-
- ASTContext &C = CGM.getContext();
- const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
- CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
-
- // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
- // RedList, shuffle_reduce_func, interwarp_copy_func);
- // or
- // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
+  // Source location for the ident struct.
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
- llvm::Value *Res;
- // 1. Build a list of reduction variables.
- // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
- auto Size = RHSExprs.size();
- for (const Expr *E : Privates) {
- if (E->getType()->isVariablyModifiedType())
- // Reserve place for array size.
- ++Size;
- }
- llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
- QualType ReductionArrayTy = C.getConstantArrayType(
- C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,
- /*IndexTypeQuals=*/0);
- Address ReductionList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
- auto IPriv = Privates.begin();
- unsigned Idx = 0;
- for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
- Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- CGF.Builder.CreateStore(
- CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
- Elem);
- if ((*IPriv)->getType()->isVariablyModifiedType()) {
- // Store array size.
- ++Idx;
- Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- llvm::Value *Size = CGF.Builder.CreateIntCast(
- CGF.getVLASize(
- CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
- .NumElts,
- CGF.SizeTy, /*isSigned=*/false);
- CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
- Elem);
- }
- }
-
- llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- ReductionList.emitRawPointer(CGF), CGF.VoidPtrTy);
- llvm::Function *ReductionFn = emitReductionFunction(
- CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
- Privates, LHSExprs, RHSExprs, ReductionOps);
- llvm::Value *ReductionDataSize =
- CGF.getTypeSize(C.getRecordType(ReductionRec));
- ReductionDataSize =
- CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
- llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
- CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
- llvm::Value *InterWarpCopyFn =
- emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
-
- if (ParallelReduction) {
- llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
- InterWarpCopyFn};
-
- Res = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
- Args);
- } else {
- assert(TeamsReduction && "expected teams reduction.");
- TeamsReductions.push_back(ReductionRec);
- auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
- {}, "_openmp_teams_reductions_buffer_$_$ptr");
- llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
- llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
- ReductionFn);
- llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
- llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
- ReductionFn);
-
- llvm::Value *Args[] = {
- RTLoc,
- KernelTeamsReductionPtr,
- CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
- ReductionDataSize,
- RL,
- ShuffleAndReduceFn,
- InterWarpCopyFn,
- GlobalToBufferCpyFn,
- GlobalToBufferRedFn,
- BufferToGlobalCpyFn,
- BufferToGlobalRedFn};
-
- Res = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
- Args);
- }
-
- // 5. Build if (res == 1)
- llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
- llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
- Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
- CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
-
- // 6. Build then branch: where we have reduced values in the master
- // thread in each team.
- // __kmpc_end_reduce{_nowait}(<gtid>);
- // break;
- CGF.EmitBlock(ThenBB);
+ using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+ InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
+ CGF.AllocaInsertPt->getIterator());
+ InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
+ CGF.Builder.GetInsertPoint());
+ llvm::OpenMPIRBuilder::LocationDescription OmpLoc(CodeGenIP);
+ llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
- // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
- auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
- this](CodeGenFunction &CGF, PrePostActionTy &Action) {
- auto IPriv = Privates.begin();
- auto ILHS = LHSExprs.begin();
- auto IRHS = RHSExprs.begin();
- for (const Expr *E : ReductionOps) {
- emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
- cast<DeclRefExpr>(*IRHS));
- ++IPriv;
- ++ILHS;
- ++IRHS;
+ CodeGenFunction::OMPPrivateScope Scope(CGF);
+ unsigned Idx = 0;
+ for (const Expr *Private : Privates) {
+ llvm::Type *ElementType;
+ llvm::Value *Variable;
+ llvm::Value *PrivateVariable;
+ llvm::OpenMPIRBuilder::AtomicReductionGenCB AtomicReductionGen = nullptr;
+ ElementType = CGF.ConvertTypeForMem(Private->getType());
+ const auto *RHSVar =
+ cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
+ PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).getBasePointer();
+ const auto *LHSVar =
+ cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
+ Variable = CGF.GetAddrOfLocalVar(LHSVar).getBasePointer();
+ llvm::OpenMPIRBuilder::EvaluationKindTy EvalKind;
+ switch (CGF.getEvaluationKind(Private->getType())) {
+ case TEK_Scalar:
+ EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar;
+ break;
+ case TEK_Complex:
+ EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Complex;
+ break;
+ case TEK_Aggregate:
+ EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Aggregate;
+ break;
}
- };
- RegionCodeGenTy RCG(CodeGen);
- RCG(CGF);
- // There is no need to emit line number for unconditional branch.
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
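+    // Clang-style reduction generator: emits the combiner for reduction I at
+    // the given insertion point and reports back the LHS/RHS addresses it
+    // used so that createReductionsGPU can fix them up later.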
+ auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
+ llvm::Value **LHSPtr, llvm::Value **RHSPtr,
+ llvm::Function *NewFunc) {
+ CGF.Builder.restoreIP(CodeGenIP);
+ auto *CurFn = CGF.CurFn;
+ CGF.CurFn = NewFunc;
+
+ *LHSPtr = CGF.GetAddrOfLocalVar(
+ cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
+ .getBasePointer();
+ *RHSPtr = CGF.GetAddrOfLocalVar(
+ cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
+ .getBasePointer();
+
+ emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
+ cast<DeclRefExpr>(LHSExprs[I]),
+ cast<DeclRefExpr>(RHSExprs[I]));
+
+ CGF.CurFn = CurFn;
+
+ return InsertPointTy(CGF.Builder.GetInsertBlock(),
+ CGF.Builder.GetInsertPoint());
+ };
+ ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
+ ElementType, Variable, PrivateVariable, EvalKind,
+ /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
+ Idx++;
+ }
+
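+  // Emit the reduction through the OMPIRBuilder's GPU lowering, passing the
+  // teams/distribute/simd reduction kinds computed above.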
+  CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
+      OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
+      DistributeReduction, SimdReduction,
+      llvm::OpenMPIRBuilder::ReductionGenCBTy::Clang,
+      CGF.getTarget().getGridValue(),
+      C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc));
+ return;
}
const VarDecl *
@@ -3574,3 +3502,4 @@ llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
CGM.getModule(), OMPRTL___kmpc_get_warp_size),
Args);
}
+
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index ef3aa3a8e0dc6..369e91d33de20 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1402,6 +1402,7 @@ void CodeGenFunction::EmitOMPReductionClauseInit(
}
const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(TaskRedRef)->getDecl());
EmitVarDecl(*VD);
EmitStoreOfScalar(ReductionDesc, GetAddrOfLocalVar(VD),
/*Volatile=*/false, TaskRedRef->getType());
@@ -1442,7 +1443,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
bool WithNowait = D.getSingleClause<OMPNowaitClause>() ||
isOpenMPParallelDirective(D.getDirectiveKind()) ||
TeamsLoopCanBeParallel || ReductionKind == OMPD_simd;
- bool SimpleReduction = ReductionKind == OMPD_simd;
+  bool SimpleReduction =
+      !CGM.getLangOpts().OpenMPIsTargetDevice && ReductionKind == OMPD_simd;
// Emit nowait reduction if nowait clause is present or directive is a
// parallel directive (it always has implicit barrier).
CGM.getOpenMPRuntime().emitReduction(
@@ -2673,61 +2674,139 @@ GetAlignedMapping(const OMPSimdDirective &S, CodeGenFunction &CGF) {
}
void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
- bool UseOMPIRBuilder =
- CGM.getLangOpts().OpenMPIRBuilder && isSupportedByOpenMPIRBuilder(S);
- if (UseOMPIRBuilder) {
- auto &&CodeGenIRBuilder = [this, &S, UseOMPIRBuilder](CodeGenFunction &CGF,
- PrePostActionTy &) {
- // Use the OpenMPIRBuilder if enabled.
- if (UseOMPIRBuilder) {
- llvm::MapVector<llvm::Value *, llvm::Value *> AlignedVars =
- GetAlignedMapping(S, CGF);
- // Emit the associated statement and get its loop representation.
- const Stmt *Inner = S.getRawStmt();
- llvm::CanonicalLoopInfo *CLI =
- EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
-
- llvm::OpenMPIRBuilder &OMPBuilder =
- CGM.getOpenMPRuntime().getOMPBuilder();
- // Add SIMD specific metadata
- llvm::ConstantInt *Simdlen = nullptr;
- if (const auto *C = S.getSingleClause<OMPSimdlenClause>()) {
- RValue Len =
- this->EmitAnyExpr(C->getSimdlen(), AggValueSlot::ignored(),
- /*ignoreResult=*/true);
- auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
- Simdlen = Val;
- }
- llvm::ConstantInt *Safelen = nullptr;
- if (const auto *C = S.getSingleClause<OMPSafelenClause>()) {
- RValue Len =
- this->EmitAnyExpr(C->getSafelen(), AggValueSlot::ignored(),
- /*ignoreResult=*/true);
- auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
- Safelen = Val;
- }
- llvm::omp::OrderKind Order = llvm::omp::OrderKind::OMP_ORDER_unknown;
- if (const auto *C = S.getSingleClause<OMPOrderClause>()) {
- if (C->getKind() == OpenMPOrderClauseKind ::OMPC_ORDER_concurrent) {
- Order = llvm::omp::OrderKind::OMP_ORDER_concurrent;
- }
- }
- // Add simd metadata to the collapsed loop. Do not generate
- // another loop for if clause. Support for if clause is done earlier.
- OMPBuilder.applySimd(CLI, AlignedVars,
- /*IfCond*/ nullptr, Order, Simdlen, Safelen);
- return;
- }
+ bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIsTargetDevice;
+  if (UseOMPIRBuilder) {
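+    // On the device, lower 'simd' through the OpenMPIRBuilder: DistanceCB
+    // computes the loop trip count, and createSimdLoop drives BodyGenCB to
+    // emit the privatized loop body and the reduction epilog.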
+ auto *CS = dyn_cast<CapturedStmt>(S.getAssociatedStmt());
+ auto *CL = dyn_cast<OMPCanonicalLoop>(CS->getCapturedStmt());
+ CGCapturedStmtInfo CGSI(*CS, CR_OpenMP);
+
+ CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(*this, &CGSI);
+ llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
+ AllocaInsertPt->getParent(), AllocaInsertPt->getIterator());
+
+ llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder();
+
+ using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+
+    // Callback for computing the trip count of the loop. It emits the
+    // loop-variable initialization and the distance function, and returns
+    // the trip-count value.
+ llvm::Value *LoopVar;
+ std::string LoopVarName;
+ EmittedClosureTy LoopVarClosure;
+
+ auto DistanceCB = [&](llvm::BasicBlock *AllocaBB,
+ InsertPointTy CodeGenIP) -> llvm::Value* {
+ InsertPointTy AllocaIP(AllocaBB, AllocaBB->getTerminator()->getIterator());
+ OMPBuilderCBHelpers::OutlinedRegionBodyRAII IRB(
+ *this, AllocaIP, *(CodeGenIP.getBlock()));
+ Builder.restoreIP(CodeGenIP);
+
+      // Emit the loop variable; it is needed by the distance function.
+      const auto *For = dyn_cast<ForStmt>(CL->getLoopStmt());
+      if (const Stmt *InitStmt = For->getInit())
+        EmitStmt(InitStmt);
+
+ auto *LoopVarRef = CL->getLoopVarRef();
+ LValue LCVal = EmitLValue(LoopVarRef);
+ LoopVar = dyn_cast<llvm::Instruction>(LCVal.getPointer(*this));
+ LoopVarName = LoopVarRef->getNameInfo().getAsString();
+
+ // Emit the distance func from the CanonicalLoop
+ const CapturedStmt *DistanceFunc = CL->getDistanceFunc();
+ EmittedClosureTy DistanceClosure = emitCapturedStmtFunc(*this, DistanceFunc);
+
+      // Call the distance function and load the resulting trip count.
+ QualType LogicalTy = DistanceFunc->getCapturedDecl()
+ ->getParam(0)
+ ->getType()
+ .getNonReferenceType();
+
+ RawAddress CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
+
+ emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()});
+ auto *TripCount = Builder.CreateLoad(CountAddr, ".count");
+
+ const CapturedStmt *LoopVarFunc = CL->getLoopVarFunc();
+ LoopVarClosure = emitCapturedStmtFunc(*this, LoopVarFunc);
+
+ return TripCount;
};
- {
- auto LPCRegion =
- CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
- OMPLexicalScope Scope(*this, S, OMPD_unknown);
- CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd,
- CodeGenIRBuilder);
- }
+
+ auto FiniCB = [this](InsertPointTy IP) {
+ OMPBuilderCBHelpers::FinalizeOMPRegion(*this, IP);
+ };
+
+ auto PrivCB = [](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+ llvm::Value &, llvm::Value &Val, llvm::Value *&ReplVal) {
+ ReplVal = &Val;
+ return CodeGenIP;
+ };
+
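+    // Loop-body generator: emits the private/firstprivate/reduction clauses
+    // in the prolog, maps the virtual iteration number onto the user loop
+    // variable, emits the loop body, and finalizes the reductions in the
+    // epilog.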
+    auto BodyGenCB = [&](llvm::BasicBlock *OuterAllocaBB,
+                         InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                         InsertPointTy Prolog, InsertPointTy ReductionEpilog,
+                         llvm::Value *Virtual) {
+
+ llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
+ InsertPointTy OuterAllocaIP(OuterAllocaBB, OuterAllocaBB->getTerminator()->getIterator());
+
+ OMPBuilderCBHelpers::OutlinedRegionBodyRAII IRB(
+ *this, OuterAllocaIP, *(Prolog.getBlock()));
+ Builder.restoreIP(Prolog);
+
+ OMPPrivateScope PrivateScope(*this);
+ EmitOMPFirstprivateClause(S, PrivateScope);
+ EmitOMPPrivateClause(S, PrivateScope);
+ EmitOMPReductionClauseInit(S, PrivateScope);
+ PrivateScope.Privatize();
+
+ Builder.restoreIP(CodeGenIP);
+ emitCapturedStmtCall(*this, LoopVarClosure,
+ {LoopVar, Virtual});
+
+ // Generate the body of the loop
+ OMPBuilderCBHelpers::EmitOMPOutlinedRegionBody(
+ *this,
+ S.getBody(),
+ AllocaIP,
+ CodeGenIP,
+ "simd");
+
+ llvm::BasicBlock *RedEpilogBB = ReductionEpilog.getBlock();
+ llvm::Instruction *RedEpilogTerminator = RedEpilogBB->getTerminator();
+ llvm::BasicBlock *FinalBlock = RedEpilogBB->getSingleSuccessor();
+
+ Builder.restoreIP(ReductionEpilog);
+ EmitOMPReductionClauseFinal(S, OMPD_simd);
+
+ llvm::BasicBlock *ReductionThenBB = Builder.GetInsertBlock();
+
+      if (!ReductionThenBB->getTerminator()) {
+ RedEpilogTerminator->eraseFromParent();
+ Builder.CreateBr(FinalBlock);
+ }
+
+ };
+
+    Builder.restoreIP(OMPBuilder.createSimdLoop(Builder, AllocaIP, BodyGenCB,
+                                                DistanceCB, PrivCB, FiniCB));
+
return;
- }
+ }
ParentLoopDirectiveForScanRegion ScanRegion(*this, S);
OMPFirstScanLoop = true;
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 87766a758311d..851a93af8bb83 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -102,7 +102,7 @@ CodeGenFunction::~CodeGenFunction() {
// seems to be a reasonable spot. We do it here, as opposed to the deletion
// time of the CodeGenModule, because we have to ensure the IR has not yet
// been "emitted" to the outside, thus, modifications are still sensible.
- if (CGM.getLangOpts().OpenMPIRBuilder && CurFn)
+ if ((CGM.getLangOpts().OpenMPIsTargetDevice || CGM.getLangOpts().OpenMPIRBuilder) && CurFn)
CGM.getOpenMPRuntime().getOMPBuilder().finalize(CurFn);
}
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 53d89ce2fa3e9..9bd78fa9351c8 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2974,10 +2974,8 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
Sema::CompoundScopeRAII Scope(Actions);
AssociatedStmt = ParseStatement();
- if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
- getLangOpts().OpenMPIRBuilder)
- AssociatedStmt =
- Actions.OpenMP().ActOnOpenMPLoopnest(AssociatedStmt.get());
+ if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind))
+ AssociatedStmt = Actions.OpenMP().ActOnOpenMPLoopnest(AssociatedStmt.get());
}
AssociatedStmt =
Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses);
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
index 3f752ac663f41..4194bdec549dd 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
@@ -85,6 +85,7 @@ int foo() {
// IR-GPU-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
// IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
// IR-GPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
// IR-GPU-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
@@ -102,7 +103,6 @@ int foo() {
// IR-GPU-NEXT: [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
// IR-GPU-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
-// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
// IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
@@ -258,6 +258,7 @@ int foo() {
// IR-GPU-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
// IR-GPU-NEXT: [[J5:%.*]] = alloca i32, align 4, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
// IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
// IR-GPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
// IR-GPU-NEXT: [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
@@ -275,7 +276,6 @@ int foo() {
// IR-GPU-NEXT: [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
// IR-GPU-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
// IR-GPU-NEXT: [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
-// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
// IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
@@ -399,12 +399,12 @@ int foo() {
// IR-GPU-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
// IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
// IR-GPU-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
-// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
// IR-GPU-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
@@ -480,10 +480,10 @@ int foo() {
// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
// IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
-// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
// IR-GPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
@@ -544,12 +544,12 @@ int foo() {
// IR-GPU-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
// IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
// IR-GPU-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
-// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
// IR-GPU-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
@@ -625,10 +625,10 @@ int foo() {
// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
// IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
-// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
// IR-GPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index c9ee0c25194c2..870e0edad19c8 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -16,6 +16,7 @@
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Allocator.h"
@@ -99,7 +100,10 @@ class OpenMPIRBuilderConfig {
/// expanded.
std::optional<bool> IsGPU;
- // Flag for specifying if offloading is mandatory.
+ /// Flag for specifying if LLVMUsed information should be emitted.
+ std::optional<bool> EmitLLVMUsed;
+
+ /// Flag for specifying if offloading is mandatory.
std::optional<bool> OpenMPOffloadMandatory;
/// First separator used between the initial two parts of a name.
@@ -107,6 +111,9 @@ class OpenMPIRBuilderConfig {
/// Separator used between all of the remaining consecutive parts of a name
std::optional<StringRef> Separator;
+  /// Grid value for the GPU target
+ std::optional<omp::GV> GridValue;
+
OpenMPIRBuilderConfig();
OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
bool OpenMPOffloadMandatory,
@@ -132,6 +139,11 @@ class OpenMPIRBuilderConfig {
return *OpenMPOffloadMandatory;
}
+ omp::GV getGridValue() const {
+ assert(GridValue.has_value() && "GridValue is not set");
+ return *GridValue;
+ }
+
bool hasRequiresFlags() const { return RequiresFlags; }
bool hasRequiresReverseOffload() const;
bool hasRequiresUnifiedAddress() const;
@@ -164,9 +176,11 @@ class OpenMPIRBuilderConfig {
void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
void setIsGPU(bool Value) { IsGPU = Value; }
+ void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsed = Value; }
void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
void setSeparator(StringRef S) { Separator = S; }
+ void setGridValue(omp::GV G) { GridValue = G; }
void setHasRequiresReverseOffload(bool Value);
void setHasRequiresUnifiedAddress(bool Value);
@@ -262,7 +276,7 @@ class OffloadEntriesInfoManager {
/// Return true if a there are no entries defined.
bool empty() const;
/// Return number of entries defined so far.
unsigned size() const { return OffloadingEntriesNum; }
OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}
@@ -470,6 +484,11 @@ class OpenMPIRBuilder {
/// all functions are finalized.
void finalize(Function *Fn = nullptr);
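+
+  /// Globalization support for the device: rewrite allocas whose values may
+  /// be shared across threads so they are allocated in globalized storage,
+  /// and track the variables that need this treatment.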
+ CallInst *globalizeAlloca(AllocaInst *Alloca, SmallVector<Instruction*, 32>&);
+ void globalizeParallelVars(Function *CurFn);
+ SmallPtrSet<Value*, 32> VarsNeedingGlobalization;
+ void globalizeVars(Function *CurFn);
+
/// Add attributes known for \p FnID to \p Fn.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
@@ -548,6 +567,18 @@ class OpenMPIRBuilder {
using BodyGenCallbackTy =
function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
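+
+  /// Callback type for the body generator of a device simd loop. It receives
+  /// the outer alloca block, the alloca and code-gen insertion points, the
+  /// insertion points for the prolog and the reduction epilog, and the
+  /// current (virtual) iteration number.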
+  using LoopBodyCallbackTy = function_ref<void(
+      BasicBlock *OuterAllocaBB, InsertPointTy AllocaIP,
+      InsertPointTy CodeGenIP, InsertPointTy PrologIP,
+      InsertPointTy ReductionEpilogIP, Value *IterationNum)>;
+
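+  /// Callback type for computing the trip count of a device simd loop; it
+  /// emits the trip-count computation at the given insertion points and
+  /// returns the resulting value.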
+  using TripCountCallbackTy =
+      function_ref<Value *(BasicBlock *AllocaBB, InsertPointTy CodeGenIP)>;
+
// This is created primarily for sections construct as llvm::function_ref
// (BodyGenCallbackTy) is not storable (as described in the comments of
// function_ref class - function_ref contains non-ownable reference
@@ -607,15 +638,17 @@ class OpenMPIRBuilder {
/// Generator for '#omp barrier'
///
/// \param Loc The location where the barrier directive was encountered.
/// \param DK The kind of directive that caused the barrier.
/// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
/// \param CheckCancelFlag Flag to indicate a cancel barrier return value
/// should be checked and acted upon.
+ /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
///
/// \returns The insertion point after the barrier.
InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive DK,
bool ForceSimpleCall = false,
- bool CheckCancelFlag = true);
+ bool CheckCancelFlag = true,
+ Value *ThreadID = nullptr);
/// Generator for '#omp cancel'
///
@@ -627,6 +660,13 @@ class OpenMPIRBuilder {
InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition,
omp::Directive CanceledDirective);
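+
+  /// Generator for an offloaded '#omp simd' loop, e.g., for input such as
+  ///
+  /// \code
+  /// #pragma omp simd reduction(+ : sum)
+  /// for (int i = 0; i < n; ++i)
+  ///   sum += a[i];
+  /// \endcode
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param AllocaIP Insertion point for allocas.
+  /// \param BodyGenCB Callback that generates the loop body.
+  /// \param DistanceCB Callback that computes the loop trip count.
+  /// \param PrivCB Callback to privatize captured values.
+  /// \param FiniCB Callback to finalize variable copies.
+  ///
+  /// \returns The insertion point after the simd loop.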
+  InsertPointTy createSimdLoop(const LocationDescription &Loc,
+                               InsertPointTy AllocaIP,
+                               LoopBodyCallbackTy BodyGenCB,
+                               TripCountCallbackTy DistanceCB,
+                               PrivatizeCallbackTy PrivCB,
+                               FinalizeCallbackTy FiniCB);
+
/// Generator for '#omp parallel'
///
/// \param Loc The insert and source location description.
@@ -1235,27 +1275,56 @@ class OpenMPIRBuilder {
getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
StringRef ParentName = "");
- /// Functions used to generate reductions. Such functions take two Values
- /// representing LHS and RHS of the reduction, respectively, and a reference
- /// to the value that is updated to refer to the reduction result.
- using ReductionGenTy =
- function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;
+  /// Enum class for the ReductionGen callback type to be used.
+ enum class ReductionGenCBTy { Clang, MLIR };
+
+ /// ReductionGen CallBack for Clang
+ ///
+ /// \param CodeGenIP InsertPoint for CodeGen.
+ /// \param Index Index of the ReductionInfo to generate code for.
+  /// \param LHS Optionally used by Clang to return the LHS pointer it used
+  /// for codegen, used for fixup later.
+  /// \param RHS Optionally used by Clang to return the RHS pointer it used
+  /// for codegen, used for fixup later.
+  /// \param CurFn Optionally used by Clang to pass in the current function,
+  /// as Clang's context may be stale.
+ using ReductionGenCBClang =
+ std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
+ Value **LHS, Value **RHS, Function *CurFn)>;
+
+ /// ReductionGen CallBack for MLIR
+ ///
+ /// \param CodeGenIP InsertPoint for CodeGen.
+ /// \param LHS Pass in the LHS Value to be used for CodeGen.
+  /// \param RHS Pass in the RHS Value to be used for CodeGen.
+  /// \param Res Reference in which the combined result Value is returned.
+ using ReductionGenCB = std::function<InsertPointTy(
+ InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
/// Functions used to generate atomic reductions. Such functions take two
/// Values representing pointers to LHS and RHS of the reduction, as well as
/// the element type of these pointers. They are expected to atomically
/// update the LHS to the reduced value.
- using AtomicReductionGenTy =
- function_ref<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+ using AtomicReductionGenCB =
+ std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+
+ /// Enum class for reduction evaluation types scalar, complex and aggregate.
+ enum class EvaluationKindTy { Scalar, Complex, Aggregate };
/// Information about an OpenMP reduction.
struct ReductionInfo {
ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
- ReductionGenTy ReductionGen,
- AtomicReductionGenTy AtomicReductionGen)
+ EvaluationKindTy EvaluationKind, ReductionGenCB ReductionGen,
+ ReductionGenCBClang ReductionGenClang,
+ AtomicReductionGenCB AtomicReductionGen)
: ElementType(ElementType), Variable(Variable),
- PrivateVariable(PrivateVariable), ReductionGen(ReductionGen),
+ PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
+ ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
AtomicReductionGen(AtomicReductionGen) {}
+ ReductionInfo(Value *PrivateVariable)
+ : ElementType(nullptr), Variable(nullptr),
+ PrivateVariable(PrivateVariable),
+ EvaluationKind(EvaluationKindTy::Scalar), ReductionGen(),
+ ReductionGenClang(), AtomicReductionGen() {}
/// Reduction element type, must match pointee type of variable.
Type *ElementType;
@@ -1266,18 +1335,543 @@ class OpenMPIRBuilder {
/// Thread-private partial reduction variable.
Value *PrivateVariable;
+ /// Reduction evaluation type - scalar, complex or aggregate.
+ EvaluationKindTy EvaluationKind;
+
/// Callback for generating the reduction body. The IR produced by this will
/// be used to combine two values in a thread-safe context, e.g., under
/// lock or within the same thread, and therefore need not be atomic.
- ReductionGenTy ReductionGen;
+ ReductionGenCB ReductionGen;
+
+ /// Clang callback for generating the reduction body. The IR produced by
+ /// this will be used to combine two values in a thread-safe context, e.g.,
+ /// under lock or within the same thread, and therefore need not be atomic.
+ ReductionGenCBClang ReductionGenClang;
/// Callback for generating the atomic reduction body, may be null. The IR
/// produced by this will be used to atomically combine two values during
/// reduction. If null, the implementation will use the non-atomic version
/// along with the appropriate synchronization mechanisms.
- AtomicReductionGenTy AtomicReductionGen;
+ AtomicReductionGenCB AtomicReductionGen;
};
+ enum class CopyAction : unsigned {
+ // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
+ // the warp using shuffle instructions.
+ RemoteLaneToThread,
+ // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
+ ThreadCopy,
+ };
+
+ struct CopyOptionsTy {
+ Value *RemoteLaneOffset = nullptr;
+ Value *ScratchpadIndex = nullptr;
+ Value *ScratchpadWidth = nullptr;
+ };
+
+ /// Supporting functions for Reductions CodeGen.
+private:
+ /// Emit the llvm.used metadata.
+ void emitUsed(StringRef Name, std::vector<llvm::WeakTrackingVH> &List);
+
+ /// Get the id of the current thread on the GPU.
+ Value *getGPUThreadID();
+
+ /// Get the GPU warp size.
+ Value *getGPUWarpSize();
+
+ /// Get the id of the warp in the block.
+ /// We assume that the warp size is 32, which is always the case
+ /// on the NVPTX device, to generate more efficient code.
+ Value *getNVPTXWarpID();
+
+ /// Get the id of the current lane in the Warp.
+ /// We assume that the warp size is 32, which is always the case
+ /// on the NVPTX device, to generate more efficient code.
+ Value *getNVPTXLaneID();
+
+ /// Cast value to the specified type.
+ Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
+
+ /// This function creates calls to one of two shuffle functions to copy
+ /// variables between lanes in a warp.
+ Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
+ Type *ElementType, Value *Offset);
+
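+  /// Copy a reduce element from a remote lane into DstAddr, using the
+  /// shuffle helpers above to move the value between lanes.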
+ void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
+ Type *ElementType, Value *Offset,
+ Type *ReductionArrayTy);
+
+ /// Emit instructions to copy a Reduce list, which contains partially
+ /// aggregated values, in the specified direction.
+ void emitReductionListCopy(
+ InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
+ ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
+ CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
+
+ /// Emit a helper that reduces data across two OpenMP threads (lanes)
+ /// in the same warp. It uses shuffle instructions to copy over data from
+ /// a remote lane's stack. The reduction algorithm performed is specified
+ /// by the fourth parameter.
+ ///
+ /// Algorithm Versions.
+ /// Full Warp Reduce (argument value 0):
+ /// This algorithm assumes that all 32 lanes are active and gathers
+ /// data from these 32 lanes, producing a single resultant value.
+ /// Contiguous Partial Warp Reduce (argument value 1):
+ /// This algorithm assumes that only a *contiguous* subset of lanes
+ /// are active. This happens for the last warp in a parallel region
+ /// when the user specified num_threads is not an integer multiple of
+ /// 32. This contiguous subset always starts with the zeroth lane.
+ /// Partial Warp Reduce (argument value 2):
+ /// This algorithm gathers data from any number of lanes at any position.
+ /// All reduced values are stored in the lowest possible lane. The set
+ /// of problems every algorithm addresses is a super set of those
+ /// addressable by algorithms with a lower version number. Overhead
+ /// increases as algorithm version increases.
+ ///
+ /// Terminology
+ /// Reduce element:
+ /// Reduce element refers to the individual data field with primitive
+ /// data types to be combined and reduced across threads.
+ /// Reduce list:
+ /// Reduce list refers to a collection of local, thread-private
+ /// reduce elements.
+ /// Remote Reduce list:
+ /// Remote Reduce list refers to a collection of remote (relative to
+ /// the current thread) reduce elements.
+ ///
+ /// We distinguish between three states of threads that are important to
+ /// the implementation of this function.
+ /// Alive threads:
+ /// Threads in a warp executing the SIMT instruction, as distinguished from
+ /// threads that are inactive due to divergent control flow.
+ /// Active threads:
+ /// The minimal set of threads that has to be alive upon entry to this
+ /// function. The computation is correct iff active threads are alive.
+ /// Some threads are alive but they are not active because they do not
+ /// contribute to the computation in any useful manner. Turning them off
+ /// may introduce control flow overheads without any tangible benefits.
+ /// Effective threads:
+ /// In order to comply with the argument requirements of the shuffle
+ /// function, we must keep all lanes holding data alive. But at most
+ /// half of them perform value aggregation; we refer to this half of
+ /// threads as effective. The other half is simply handing off their
+ /// data.
+ ///
+ /// Procedure
+ /// Value shuffle:
+ /// In this step active threads transfer data from higher lane positions
+ /// in the warp to lower lane positions, creating Remote Reduce list.
+ /// Value aggregation:
+ /// In this step, effective threads combine their thread local Reduce list
+ /// with Remote Reduce list and store the result in the thread local
+ /// Reduce list.
+ /// Value copy:
+ /// In this step, we deal with the assumption made by algorithm 2
+ /// (i.e. contiguity assumption). When we have an odd number of lanes
+ /// active, say 2k+1, only k threads will be effective and therefore k
+ /// new values will be produced. However, the Reduce list owned by the
+ /// (2k+1)th thread is ignored in the value aggregation. Therefore
+ /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
+ /// that the contiguity assumption still holds.
+ ///
+ /// \param ReductionInfos Array type containing the ReductionOps.
+ /// \param ReduceFn The reduction function.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The ShuffleAndReduce function.
+ Function *emitShuffleAndReduceFunction(
+ ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+ Function *ReduceFn, AttributeList FuncAttrs);
+
+ /// This function emits a helper that gathers Reduce lists from the first
+ /// lane of every active warp to lanes in the first warp.
+ ///
+ /// void inter_warp_copy_func(void* reduce_data, num_warps)
+ /// shared smem[warp_size];
+ /// For all data entries D in reduce_data:
+ /// sync
+ /// If (I am the first lane in each warp)
+ /// Copy my local D to smem[warp_id]
+ /// sync
+ /// if (I am the first warp)
+ /// Copy smem[thread_id] to my local D
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param ReductionInfos Array type containing the ReductionOps.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The InterWarpCopy function.
+ Function *emitInterWarpCopyFunction(const LocationDescription &Loc,
+ ArrayRef<ReductionInfo> ReductionInfos,
+ AttributeList FuncAttrs);
+
+ /// This function emits a helper that copies all the reduction variables from
+ /// the team into the provided global buffer for the reduction variables.
+ ///
+ /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
+ /// For all data entries D in reduce_data:
+ /// Copy local D to buffer.D[Idx]
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The ListToGlobalCopy function.
+ Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
+ Type *ReductionsBufferTy,
+ AttributeList FuncAttrs);
+
+  /// This function emits a helper that copies all the reduction variables from
+  /// the provided global buffer back into the team's reduction variables.
+  ///
+  /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
+ /// For all data entries D in reduce_data:
+ /// Copy buffer.D[Idx] to local D;
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The GlobalToList function.
+ Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
+ Type *ReductionsBufferTy,
+ AttributeList FuncAttrs);
+
+ /// This function emits a helper that reduces all the reduction variables from
+ /// the team into the provided global buffer for the reduction variables.
+ ///
+ /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
+ /// void *GlobPtrs[];
+ /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
+ /// ...
+ /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
+ /// reduce_function(GlobPtrs, reduce_data);
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The ListToGlobalReduce function.
+ Function *
+ emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
+ Function *ReduceFn, Type *ReductionsBufferTy,
+ AttributeList FuncAttrs);
+
+  /// This function emits a helper that reduces all the reduction variables in
+  /// the provided global buffer into the team's reduction variables.
+ ///
+ /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
+ /// void *GlobPtrs[];
+ /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
+ /// ...
+ /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
+ /// reduce_function(reduce_data, GlobPtrs);
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The GlobalToListReduce function.
+ Function *
+ emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
+ Function *ReduceFn, Type *ReductionsBufferTy,
+ AttributeList FuncAttrs);
+
+ /// Get the function name of a reduction function.
+ std::string getReductionFuncName(StringRef Name) const;
+
+ /// Emits reduction function.
+ /// \param ReducerName Name of the function calling the reduction.
+ /// \param ReductionInfos Array type containing the ReductionOps.
+  /// \param ReductionGenCBTy Optional param to specify whether Clang- or
+  /// MLIR-style CodeGen callbacks are used.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The reduction function.
+ Function *createReductionFunction(
+ StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
+ ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
+ AttributeList FuncAttrs = {});
+
+public:
+ ///
+ /// Design of OpenMP reductions on the GPU
+ ///
+ /// Consider a typical OpenMP program with one or more reduction
+ /// clauses:
+ ///
+ /// float foo;
+ /// double bar;
+ /// #pragma omp target teams distribute parallel for \
+ /// reduction(+:foo) reduction(*:bar)
+ /// for (int i = 0; i < N; i++) {
+ /// foo += A[i]; bar *= B[i];
+ /// }
+ ///
+ /// where 'foo' and 'bar' are reduced across all OpenMP threads in
+ /// all teams. In our OpenMP implementation on the NVPTX device an
+ /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
+ /// within a team are mapped to CUDA threads within a threadblock.
+ /// Our goal is to efficiently aggregate values across all OpenMP
+ /// threads such that:
+ ///
+ /// - the compiler and runtime are logically concise, and
+ /// - the reduction is performed efficiently in a hierarchical
+ /// manner as follows: within OpenMP threads in the same warp,
+ /// across warps in a threadblock, and finally across teams on
+ /// the NVPTX device.
+ ///
+ /// Introduction to Decoupling
+ ///
+ /// We would like to decouple the compiler and the runtime so that the
+ /// latter is ignorant of the reduction variables (number, data types)
+ /// and the reduction operators. This allows a simpler interface
+ /// and implementation while still attaining good performance.
+ ///
+ /// Pseudocode for the aforementioned OpenMP program generated by the
+ /// compiler is as follows:
+ ///
+ /// 1. Create private copies of reduction variables on each OpenMP
+ /// thread: 'foo_private', 'bar_private'
+ /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
+ /// to it and writes the result in 'foo_private' and 'bar_private'
+ /// respectively.
+ /// 3. Call the OpenMP runtime on the GPU to reduce within a team
+ /// and store the result on the team master:
+ ///
+ /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
+ /// reduceData, shuffleReduceFn, interWarpCpyFn)
+ ///
+ /// where:
+ /// struct ReduceData {
+ /// double *foo;
+ /// double *bar;
+ /// } reduceData
+ /// reduceData.foo = &foo_private
+ /// reduceData.bar = &bar_private
+ ///
+ /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
+ /// auxiliary functions generated by the compiler that operate on
+  /// variables of type 'ReduceData'. They aid the runtime in performing
+ /// algorithmic steps in a data agnostic manner.
+ ///
+ /// 'shuffleReduceFn' is a pointer to a function that reduces data
+ /// of type 'ReduceData' across two OpenMP threads (lanes) in the
+ /// same warp. It takes the following arguments as input:
+ ///
+ /// a. variable of type 'ReduceData' on the calling lane,
+ /// b. its lane_id,
+ /// c. an offset relative to the current lane_id to generate a
+ /// remote_lane_id. The remote lane contains the second
+ /// variable of type 'ReduceData' that is to be reduced.
+ /// d. an algorithm version parameter determining which reduction
+ /// algorithm to use.
+ ///
+ /// 'shuffleReduceFn' retrieves data from the remote lane using
+ /// efficient GPU shuffle intrinsics and reduces, using the
+ /// algorithm specified by the 4th parameter, the two operands
+ /// element-wise. The result is written to the first operand.
+ ///
+ /// Different reduction algorithms are implemented in different
+ /// runtime functions, all calling 'shuffleReduceFn' to perform
+ /// the essential reduction step. Therefore, based on the 4th
+ /// parameter, this function behaves slightly differently to
+ /// cooperate with the runtime to ensure correctness under
+ /// different circumstances.
+ ///
+ /// 'InterWarpCpyFn' is a pointer to a function that transfers
+ /// reduced variables across warps. It tunnels, through CUDA
+ /// shared memory, the thread-private data of type 'ReduceData'
+ /// from lane 0 of each warp to a lane in the first warp.
+ /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
+ /// The last team writes the global reduced value to memory.
+ ///
+ /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
+ /// reduceData, shuffleReduceFn, interWarpCpyFn,
+ /// scratchpadCopyFn, loadAndReduceFn)
+ ///
+ /// 'scratchpadCopyFn' is a helper that stores reduced
+ /// data from the team master to a scratchpad array in
+ /// global memory.
+ ///
+ /// 'loadAndReduceFn' is a helper that loads data from
+ /// the scratchpad array and reduces it with the input
+ /// operand.
+ ///
+ /// These compiler generated functions hide address
+ /// calculation and alignment information from the runtime.
+ /// 5. if ret == 1:
+ /// The team master of the last team stores the reduced
+ /// result to the globals in memory.
+ /// foo += reduceData.foo; bar *= reduceData.bar
+ ///
+ ///
+ /// Warp Reduction Algorithms
+ ///
+ /// On the warp level, we have three algorithms implemented in the
+ /// OpenMP runtime depending on the number of active lanes:
+ ///
+ /// Full Warp Reduction
+ ///
+ /// The reduce algorithm within a warp where all lanes are active
+ /// is implemented in the runtime as follows:
+ ///
+ /// full_warp_reduce(void *reduce_data,
+ /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+ /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
+ /// ShuffleReduceFn(reduce_data, 0, offset, 0);
+ /// }
+ ///
+ /// The algorithm completes in log(2, WARPSIZE) steps.
+ ///
+ /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
+  /// not used; this saves instructions by not retrieving lane_id
+ /// from the corresponding special registers. The 4th parameter, which
+ /// represents the version of the algorithm being used, is set to 0 to
+ /// signify full warp reduction.
+ ///
+ /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+ ///
+ /// #reduce_elem refers to an element in the local lane's data structure
+ /// #remote_elem is retrieved from a remote lane
+ /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+ /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
+ ///
+ /// Contiguous Partial Warp Reduction
+ ///
+ /// This reduce algorithm is used within a warp where only the first
+ /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
+ /// number of OpenMP threads in a parallel region is not a multiple of
+ /// WARPSIZE. The algorithm is implemented in the runtime as follows:
+ ///
+  ///   void
+  ///   contiguous_partial_reduce(void *reduce_data,
+  ///                             kmp_ShuffleReductFctPtr ShuffleReduceFn,
+  ///                             int size, int lane_id) {
+  ///     int curr_size;
+  ///     int offset;
+  ///     curr_size = size;
+  ///     offset = curr_size/2;
+  ///     while (offset > 0) {
+  ///       ShuffleReduceFn(reduce_data, lane_id, offset, 1);
+  ///       curr_size = (curr_size+1)/2;
+  ///       offset = curr_size/2;
+  ///     }
+  ///   }
+ ///
+ /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+ ///
+ /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+ /// if (lane_id < offset)
+ /// reduce_elem = reduce_elem REDUCE_OP remote_elem
+ /// else
+ /// reduce_elem = remote_elem
+ ///
+ /// This algorithm assumes that the data to be reduced are located in a
+ /// contiguous subset of lanes starting from the first. When there is
+ /// an odd number of active lanes, the data in the last lane is not
+  /// aggregated with any other lane's data but is instead copied over.
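+  ///
+  /// A rough host-side sketch (illustrative only) for n == 5 active lanes:
+  ///
+  ///   int n = 5, lanes[5] = {0, 1, 2, 3, 4};
+  ///   int curr_size = n, offset = curr_size/2;
+  ///   while (offset > 0) {
+  ///     for (int lane = 0; lane < n; ++lane) {
+  ///       int remote = (lane + offset < n) ? lanes[lane + offset] : 0;
+  ///       lanes[lane] = (lane < offset) ? lanes[lane] + remote : remote;
+  ///     }
+  ///     curr_size = (curr_size + 1)/2;
+  ///     offset = curr_size/2;
+  ///   }
+  ///   // lanes[0] now holds 0+1+2+3+4 == 10 after three steps.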
+ ///
+ /// Dispersed Partial Warp Reduction
+ ///
+ /// This algorithm is used within a warp when any discontiguous subset of
+ /// lanes are active. It is used to implement the reduction operation
+ /// across lanes in an OpenMP simd region or in a nested parallel region.
+ ///
+  ///   void
+  ///   dispersed_partial_reduce(void *reduce_data,
+  ///                            kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  ///     int size, remote_id;
+  ///     int logical_lane_id = number_of_active_lanes_before_me() * 2;
+  ///     do {
+  ///       remote_id = next_active_lane_id_right_after_me();
+  ///       # the above function returns 0 if no active lane
+  ///       # is present right after the current lane.
+  ///       size = number_of_active_lanes_in_this_warp();
+  ///       logical_lane_id /= 2;
+  ///       ShuffleReduceFn(reduce_data, logical_lane_id,
+  ///                       remote_id-1-threadIdx.x, 2);
+  ///     } while (logical_lane_id % 2 == 0 && size > 1);
+  ///   }
+ ///
+ /// There is no assumption made about the initial state of the reduction.
+ /// Any number of lanes (>=1) could be active at any position. The reduction
+ /// result is returned in the first active lane.
+ ///
+ /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+ ///
+ /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+ /// if (lane_id % 2 == 0 && offset > 0)
+ /// reduce_elem = reduce_elem REDUCE_OP remote_elem
+ /// else
+ /// reduce_elem = remote_elem
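+  ///
+  /// As a rough illustration (a simplification of the runtime behavior):
+  /// with active lanes {3, 9, 14}, the first round reduces lane 3 with
+  /// lane 9's data while lane 9 copies lane 14's value, and a later round
+  /// folds the remaining value into lane 3, the first active lane, which
+  /// ends up holding the final result.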
+ ///
+ ///
+ /// Intra-Team Reduction
+ ///
+ /// This function, as implemented in the runtime call
+ /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
+ /// threads in a team. It first reduces within a warp using the
+ /// aforementioned algorithms. We then proceed to gather all such
+ /// reduced values at the first warp.
+ ///
+ /// The runtime makes use of the function 'InterWarpCpyFn', which copies
+ /// data from each of the "warp master" (zeroth lane of each warp, where
+ /// warp-reduced data is held) to the zeroth warp. This step reduces (in
+ /// a mathematical sense) the problem of reduction across warp masters in
+ /// a block to the problem of warp reduction.
+ ///
+ ///
+ /// Inter-Team Reduction
+ ///
+ /// Once a team has reduced its data to a single value, it is stored in
+ /// a global scratchpad array. Since each team has a distinct slot, this
+ /// can be done without locking.
+ ///
+ /// The last team to write to the scratchpad array proceeds to reduce the
+ /// scratchpad array. One or more workers in the last team use the helper
+  /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
+ /// the k'th worker reduces every k'th element.
+ ///
+ /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
+ /// reduce across workers and compute a globally reduced value.
+ ///
+  /// \param Loc                The location where the reduction was
+  ///                           encountered. Must be within the associated
+  ///                           directive and after the last local access to
+  ///                           the reduction variables.
+  /// \param AllocaIP           An insertion point suitable for allocas usable
+  ///                           in reductions.
+  /// \param CodeGenIP          An insertion point suitable for code
+  ///                           generation.
+  /// \param ReductionInfos     A list of info on each reduction variable.
+  /// \param IsNoWait           Optional flag set if the reduction is marked
+  ///                           as nowait.
+  /// \param IsTeamsReduction   Optional flag set if it is a teams reduction.
+  /// \param HasDistribute      Optional flag set if it is a distribute
+  ///                           reduction.
+  /// \param IsSimdReduction    Optional flag set if it is a simd reduction.
+  /// \param ReductionGenCBTy   Optional flavor of the reduction generator
+  ///                           callback (Clang or MLIR).
+  /// \param GridValue          Optional GPU grid value.
+  /// \param ReductionBufNum    Optional OpenMPCUDAReductionBufNumValue to be
+  ///                           used for teams reduction.
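+  ///
+  /// A hypothetical invocation (argument names here are illustrative, not
+  /// part of this patch) for a device simd reduction might look like:
+  ///
+  ///   Builder.restoreIP(OMPBuilder.createReductionsGPU(
+  ///       Loc, AllocaIP, CodeGenIP, ReductionInfos,
+  ///       /*IsNoWait=*/false, /*IsTeamsReduction=*/false,
+  ///       /*HasDistribute=*/false, /*IsSimdReduction=*/true,
+  ///       ReductionGenCBTy::Clang));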
+  InsertPointTy createReductionsGPU(
+      const LocationDescription &Loc, InsertPointTy AllocaIP,
+      InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
+      bool IsNoWait = false, bool IsTeamsReduction = false,
+      bool HasDistribute = false, bool IsSimdReduction = false,
+      ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
+      std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
+      Value *SrcLocInfo = nullptr);
+
// TODO: provide atomic and non-atomic reduction generators for reduction
// operators defined by the OpenMP specification.
@@ -1421,19 +2015,6 @@ class OpenMPIRBuilder {
Value *NumThreads, Value *HostPtr,
ArrayRef<Value *> KernelArgs);
- /// Generate a barrier runtime call.
- ///
- /// \param Loc The location at which the request originated and is fulfilled.
- /// \param DK The directive which caused the barrier
- /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
- /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
- /// should be checked and acted upon.
- ///
- /// \returns The insertion point after the barrier.
- InsertPointTy emitBarrierImpl(const LocationDescription &Loc,
- omp::Directive DK, bool ForceSimpleCall,
- bool CheckCancelFlag);
-
/// Generate a flush runtime call.
///
/// \param Loc The location at which the request originated and is fulfilled.
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index fe09bb8177c28..116b7e98cdd87 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -124,6 +124,9 @@ __OMP_FUNCTION_TYPE(ShuffleReduce, false, Void, VoidPtr, Int16, Int16, Int16)
__OMP_FUNCTION_TYPE(InterWarpCopy, false, Void, VoidPtr, Int32)
__OMP_FUNCTION_TYPE(GlobalList, false, Void, VoidPtr, Int32, VoidPtr)
+__OMP_FUNCTION_TYPE(LoopTask, false, Void, Int64, VoidPtrPtr)
+__OMP_FUNCTION_TYPE(SimdTask, false, Void, VoidPtrPtr)
+
#undef __OMP_FUNCTION_TYPE
#undef OMP_FUNCTION_TYPE
@@ -204,6 +207,7 @@ __ICV_RT_GET(proc_bind, omp_get_proc_bind)
__OMP_RTL(__kmpc_barrier, false, Void, IdentPtr, Int32)
+__OMP_RTL(__kmpc_simd_barrier, false, Void, )
__OMP_RTL(__kmpc_cancel, false, Int32, IdentPtr, Int32, Int32)
__OMP_RTL(__kmpc_cancel_barrier, false, Int32, IdentPtr, Int32)
__OMP_RTL(__kmpc_error, false, Void, IdentPtr, Int32, Int8Ptr)
@@ -227,6 +231,7 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
__OMP_RTL(omp_get_thread_num, false, Int32, )
+__OMP_RTL(omp_get_simd_lane, false, Int32, )
__OMP_RTL(omp_get_num_threads, false, Int32, )
__OMP_RTL(omp_get_max_threads, false, Int32, )
__OMP_RTL(omp_in_parallel, false, Int32, )
@@ -480,6 +485,8 @@ __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
__OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_shuffle_int32, false, Int32, Int32, Int16, Int16)
+__OMP_RTL(__kmpc_nvptx_simd_reduce_nowait_v2, false, Int32, IdentPtr,
+ Int64, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr)
__OMP_RTL(__kmpc_nvptx_parallel_reduce_nowait_v2, false, Int32, IdentPtr,
Int64, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr)
__OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr,
@@ -502,6 +509,10 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
+__OMP_RTL(__kmpc_simd_4u, false, Void, IdentPtr, LoopTaskPtr, Int32, VoidPtrPtr)
+__OMP_RTL(__kmpc_simd_8u, false, Void, IdentPtr, LoopTaskPtr, Int64, VoidPtrPtr)
+__OMP_RTL(__kmpc_simd, false, Void, IdentPtr, SimdTaskPtr, VoidPtrPtr, Int32)
+
__OMP_RTL(__last, false, Void, )
#undef __OMP_RTL
@@ -708,6 +719,7 @@ __OMP_RTL_ATTRS(__kmpc_get_hardware_num_threads_in_block, GetterAttrs, ZExt, Par
__OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_simd_lane, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 4d2d352f7520b..77f3f863e1e35 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -145,6 +145,8 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
}
#endif
+Function *GLOBAL_ReductionFunc = nullptr;
+
static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
if (T.isAMDGPU()) {
StringRef Features =
@@ -782,6 +784,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
for (Function *F : ConstantAllocaRaiseCandidates)
raiseUserConstantDataAllocasToEntryBlock(Builder, F);
+  // globalizeVars(Fn); // Variable globalization is currently disabled.
+
EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
[](EmitMetadataErrorKind Kind,
const TargetRegionEntryInfo &EntryInfo) -> void {
@@ -790,8 +794,167 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
"OMPIRBuilder finalization \n";
};
- if (!OffloadInfoManager.empty())
+ if (!OffloadInfoManager.empty())
createOffloadEntriesAndInfoMetadata(ErrorReportFn);
+
+ if (Config.EmitLLVMUsed) {
+ std::vector<WeakTrackingVH> LLVMCompilerUsed = {
+ M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
+ emitUsed("llvm.compiler.used", LLVMCompilerUsed);
+ }
+
+}
+
+CallInst *OpenMPIRBuilder::globalizeAlloca(
+    AllocaInst *Alloca, SmallVector<Instruction *, 32> &ToBeDeleted) {
+  FunctionCallee AllocFn =
+      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc_shared);
+
+  Builder.SetInsertPoint(Alloca);
+  Value *SharedAllocArgs[] = {ConstantInt::get(
+      Int64, Alloca->getAllocationSize(M.getDataLayout())->getFixedValue())};
+
+  CallInst *AllocSharedCall =
+      Builder.CreateCall(AllocFn, ArrayRef<Value *>(SharedAllocArgs, 1));
+  AllocSharedCall->setName(Alloca->getName() + "_on_stack");
+
+  LLVM_DEBUG(dbgs() << "Created " << *AllocSharedCall << "\n");
+
+  // If the alloca's single user is an addrspacecast (the alloca was
+  // allocated in address space 5, i.e. local), replace the cast's uses with
+  // the generic pointer returned by __kmpc_alloc_shared to avoid a type
+  // mismatch between the alloca and the runtime call.
+  if (Alloca->hasOneUser() && isa<AddrSpaceCastInst>(Alloca->user_back())) {
+    auto *AddrSpaceCast = cast<AddrSpaceCastInst>(Alloca->user_back());
+    AddrSpaceCast->replaceAllUsesWith(AllocSharedCall);
+    ToBeDeleted.push_back(AddrSpaceCast);
+  } else {
+    Alloca->replaceAllUsesWith(AllocSharedCall);
+  }
+  ToBeDeleted.push_back(Alloca);
+
+  return AllocSharedCall;
+}
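+
+// Illustrative effect of globalizeAlloca (schematic IR, assuming a 4-byte
+// variable; the exact casts depend on the original address space):
+//
+//   %x = alloca i32, align 4, addrspace(5)
+// becomes
+//   %x_on_stack = call ptr @__kmpc_alloc_shared(i64 4)
+// and globalizeParallelVars later emits the matching
+//   call void @__kmpc_free_shared(ptr %x_on_stack, i64 4)
+// at the end of the outlined parallel function.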
+
+void OpenMPIRBuilder::globalizeParallelVars(llvm::Function *CurFn) {
+  SmallVector<Instruction *, 32> ToBeDeleted;
+  std::stack<CallInst *> GlobalizedVars;
+
+  LLVM_DEBUG(dbgs() << " Exploring: " << CurFn->getName() << "\n");
+  for (BasicBlock &BB : *CurFn) {
+    for (Instruction &I : BB) {
+      if (auto *Alloca = dyn_cast<AllocaInst>(&I)) {
+        LLVM_DEBUG(dbgs() << "   Found Alloca: " << *Alloca << "\n");
+        CallInst *GlobalizedAlloca = globalizeAlloca(Alloca, ToBeDeleted);
+        GlobalizedVars.push(GlobalizedAlloca);
+      }
+    }
+  }
+
+  // Free the globalized variables, in reverse order, at the end of the
+  // outlined function.
+  BasicBlock &EndBlock = CurFn->back();
+  Builder.SetInsertPoint(EndBlock.begin());
+  FunctionCallee FreeFn =
+      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free_shared);
+  while (!GlobalizedVars.empty()) {
+    CallInst *SharedAlloc = GlobalizedVars.top();
+    GlobalizedVars.pop();
+
+    Value *SharedFreeArgs[] = {SharedAlloc, SharedAlloc->getArgOperand(0)};
+    CallInst *SharedFreeCall =
+        Builder.CreateCall(FreeFn, ArrayRef<Value *>(SharedFreeArgs, 2));
+    (void)SharedFreeCall;
+    LLVM_DEBUG(dbgs() << "   Freed - " << *SharedFreeCall << "\n");
+  }
+
+  for (Instruction *I : ToBeDeleted)
+    I->eraseFromParent();
+}
+
+// Globalize any variables that are needed in a lower level of
+// the parallel hierarchy.
+// Only variables used in 'simd' regions are supported right now.
+void OpenMPIRBuilder::globalizeVars(llvm::Function *CurFn) {
+  for (BasicBlock &BB : *CurFn) {
+    for (Instruction &I : BB) {
+      auto *FnCall = dyn_cast<CallInst>(&I);
+      if (!FnCall)
+        continue;
+      Function *Fn = FnCall->getCalledFunction();
+      if (!Fn || Fn->getName() != "__kmpc_parallel_51")
+        continue;
+
+      Function *OutlinedFn = dyn_cast<Function>(FnCall->getArgOperand(5));
+      assert(OutlinedFn && "failed to find GPU parallel outlined fn");
+
+      LLVM_DEBUG(dbgs() << "Found a parallel region\n");
+      globalizeParallelVars(OutlinedFn);
+    }
+  }
}
OpenMPIRBuilder::~OpenMPIRBuilder() {
@@ -924,16 +1087,12 @@ Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
}
OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK,
- bool ForceSimpleCall, bool CheckCancelFlag) {
+OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
+ bool ForceSimpleCall, bool CheckCancelFlag,
+ Value *ThreadID) {
if (!updateToLocation(Loc))
return Loc.IP;
- return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag);
-}
-OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
- bool ForceSimpleCall, bool CheckCancelFlag) {
// Build call __kmpc_cancel_barrier(loc, thread_id) or
// __kmpc_barrier(loc, thread_id);
@@ -958,9 +1117,11 @@ OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
uint32_t SrcLocStrSize;
Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
- Value *Args[] = {
- getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
- getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
+ if (!ThreadID)
+ ThreadID = getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
+
+ Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
+ ThreadID};
// If we are in a cancellable parallel region, barriers are cancellation
// points.
@@ -1338,6 +1499,467 @@ hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
}
}
+IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
+    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
+    LoopBodyCallbackTy BodyGenCB, TripCountCallbackTy DistanceCB,
+    PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB) {
+ assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
+
+ if (!updateToLocation(Loc))
+ return Loc.IP;
+
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+
+ BasicBlock *InsertBB = Builder.GetInsertBlock();
+ Function *OuterFn = InsertBB->getParent();
+
+ LLVM_DEBUG(dbgs() << "At the start of createSimdLoop:\n" << *OuterFn << "\n");
+
+ // Save the outer alloca block because the insertion iterator may get
+ // invalidated and we still need this later.
+ BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
+
+ // Vector to remember instructions we used only during the modeling but which
+ // we want to delete at the end.
+ SmallVector<Instruction *, 16> ToBeDeleted;
+
+ // Create an artificial insertion point that will also ensure the blocks we
+ // are about to split are not degenerated.
+ auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
+
+  Instruction *ThenTI = UI;
+
+  BasicBlock *ThenBB = ThenTI->getParent();
+
+ // Alloca block for simd
+ BasicBlock *EntryBB = ThenBB->splitBasicBlock(ThenTI, "omp.simd.entry");
+
+  // Block for setup related to simd,
+  // i.e., variable privatization, trip count, reductions
+ BasicBlock *PrologBB = EntryBB->splitBasicBlock(ThenTI, "omp.simd.prolog");
+
+  // Entry block for the outlined loop body.
+  // Allocas from the loop body should be done here.
+  BasicBlock *LoopEntryBB =
+      PrologBB->splitBasicBlock(ThenTI, "omp.simd.loop.entry");
+
+  // Block for generating the loop body.
+  BasicBlock *LoopBodyBB =
+      LoopEntryBB->splitBasicBlock(ThenTI, "omp.simd.loop.body");
+
+ BasicBlock *LoopPreFiniBB =
+ LoopBodyBB->splitBasicBlock(ThenTI, "omp.simd.loop.pre_finalize");
+
+ BasicBlock *LoopExitBB =
+ LoopPreFiniBB->splitBasicBlock(ThenTI, "omp.simd.loop.outlined.exit");
+
+ // Block for finalizing any reductions
+ BasicBlock *ReductionEpilogBB =
+ LoopExitBB->splitBasicBlock(ThenTI, "omp.reduction.epilog");
+
+ BasicBlock *FinalizeBB =
+ ReductionEpilogBB->splitBasicBlock(ThenTI, "omp.simd.finalize");
+
+  auto FiniCBWrapper = [&](InsertPointTy IP) {
+    // Hide "open-ended" blocks from the given FiniCB by setting the right
+    // jump target to the region exit block.
+    if (IP.getBlock()->end() == IP.getPoint()) {
+      IRBuilder<>::InsertPointGuard IPG(Builder);
+      Builder.restoreIP(IP);
+      Instruction *I = Builder.CreateBr(FinalizeBB);
+      IP = InsertPointTy(I->getParent(), I->getIterator());
+    }
+    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
+           IP.getBlock()->getTerminator()->getSuccessor(0) == FinalizeBB &&
+           "Unexpected insertion point for finalization call!");
+    return FiniCB(IP);
+  };
+
+ FinalizationStack.push_back({FiniCBWrapper, OMPD_simd, false});
+
+ // Compute the loop trip count
+ // Insert after the outer alloca to ensure all variables needed
+ // in its calculation are ready
+
+ InsertPointTy DistanceIP(PrologBB, PrologBB->getTerminator()->getIterator());
+ assert(DistanceCB && "expected loop trip count callback function!");
+ Value *DistVal = DistanceCB(EntryBB, DistanceIP);
+ assert(DistVal && "trip count call back should return integer trip count");
+ Type *DistValType = DistVal->getType();
+ assert(DistValType->isIntegerTy() && "trip count should be integer type");
+
+ LLVM_DEBUG(dbgs() << "After DistanceCB:\n" << *PrologBB << "\n");
+ LLVM_DEBUG(dbgs() << "Trip count variable: " << *DistVal << "\n");
+
+  // Create the virtual iteration variable that will be pulled into
+  // the outlined function.
+  Builder.SetInsertPoint(EntryBB, EntryBB->begin());
+  AllocaInst *OMPIVAlloca =
+      Builder.CreateAlloca(DistValType, nullptr, "omp.iv.tmp");
+  Instruction *OMPIV = Builder.CreateLoad(DistValType, OMPIVAlloca, "omp.iv");
+
+  // Generate the privatization allocas in the block that will become the
+  // entry of the outlined function.
+  Builder.SetInsertPoint(LoopEntryBB, LoopEntryBB->begin());
+  // Use omp.iv in the outlined region so it gets captured during outlining.
+  Instruction *OMPIVUse = dyn_cast<Instruction>(
+      Builder.CreateAdd(OMPIV, OMPIV, "omp.iv.tobedeleted"));
+  InsertPointTy InnerAllocaIP = Builder.saveIP();
+
+ // All of the temporary omp.iv variables need to be deleted later
+ // Order matters
+ ToBeDeleted.push_back(OMPIVUse);
+ ToBeDeleted.push_back(OMPIV);
+ ToBeDeleted.push_back(OMPIVAlloca);
+
+ LLVM_DEBUG(dbgs() << "omp.iv variable generated:\n" << *OuterFn << "\n");
+
+ LLVM_DEBUG(dbgs() << "Before body codegen:\n" << *OuterFn << "\n");
+ assert(BodyGenCB && "Expected body generation callback!");
+ InsertPointTy CodeGenIP(LoopBodyBB, LoopBodyBB->getTerminator()->getIterator()); //LoopBodyBB->begin());
+
+ InsertPointTy PrologIP(PrologBB, PrologBB->getTerminator()->getIterator());
+ InsertPointTy ReductionEpilogIP(ReductionEpilogBB, ReductionEpilogBB->begin());
+
+  // Generate the body of the loop. The omp.iv variable is a value in the
+  // range 0 <= omp.iv < TripCount.
+  // If a loop variable is needed, this callback can initialize it based on
+  // omp.iv.
+  BodyGenCB(EntryBB, InnerAllocaIP, CodeGenIP, PrologIP, ReductionEpilogIP,
+            OMPIV);
+
+ LLVM_DEBUG(dbgs() << "After body codegen:\n" << *OuterFn << "\n");
+
+  // The runtime entry point is chosen later, in the post-outline callback,
+  // based on the trip count type (__kmpc_simd_4u vs. __kmpc_simd_8u).
+
+ // Outline 1
+ {
+ OutlineInfo OI;
+
+    // Call the finalization callback one last time to finalize values
+    // between the pre-fini block and the exit block.
+    Instruction *LoopPreFiniTI = LoopPreFiniBB->getTerminator();
+
+    InsertPointTy PreFiniIP(LoopPreFiniBB, LoopPreFiniTI->getIterator());
+    FiniCB(PreFiniIP);
+
+    OI.OuterAllocaBB = EntryBB;
+    OI.EntryBB = LoopEntryBB;
+    OI.ExitBB = LoopExitBB;
+
+ SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
+ SmallVector<BasicBlock *, 32> Blocks;
+ OI.collectBlocks(ParallelRegionBlockSet, Blocks);
+
+    // Ensure a single exit node for the outlined region by creating one.
+    // We might have multiple incoming edges to the exit now due to
+    // finalizations, e.g., cancel calls that cause the control flow to
+    // leave the region.
+    Blocks.push_back(LoopExitBB);
+
+ CodeExtractorAnalysisCache CEAC(*OuterFn);
+
+    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
+                            /* AggregateArgs */ true,
+                            /* BlockFrequencyInfo */ nullptr,
+                            /* BranchProbabilityInfo */ nullptr,
+                            /* AssumptionCache */ nullptr,
+                            /* AllowVarArgs */ false,
+                            /* AllowAlloca */ true,
+                            /* AllocationBlock */ EntryBB,
+                            /* Suffix */ ".omp_simd");
+
+ BasicBlock *CommonExit = nullptr;
+ SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
+ Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
+ Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
+
+ auto PrivHelper = [&](Value &V) {
+ // Exclude omp.iv from aggregate
+ if (&V == OMPIV) {
+ OI.ExcludeArgsFromAggregate.push_back(&V);
+ return;
+ }
+
+ // Get all uses of value that are inside of the outlined region
+ SetVector<Use *> Uses;
+ for (Use &U : V.uses())
+ if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
+ if (ParallelRegionBlockSet.count(UserI->getParent()))
+ Uses.insert(&U);
+
+ Value *Inner = &V;
+
+ // If the value isn't a pointer type, store it in a pointer
+ // Unpack it inside the outlined region
+ if (!V.getType()->isPointerTy()) {
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
+
+ Builder.restoreIP(OuterAllocaIP);
+ Value *Ptr = Builder.CreateAlloca(
+ V.getType(), nullptr, V.getName() + ".reloaded");
+
+ // Store to stack at end of the block that currently branches to the entry
+ // block of the to-be-outlined region.
+ Builder.SetInsertPoint(
+ InsertBB, InsertBB->getTerminator()->getIterator());
+ Builder.CreateStore(&V, Ptr);
+
+ // Load back next to allocations in the to-be-outlined region.
+ Builder.restoreIP(InnerAllocaIP);
+ Inner = Builder.CreateLoad(V.getType(), Ptr);
+ }
+
+ Value *ReplacementValue = nullptr;
+ Builder.restoreIP(
+ PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
+ assert(ReplacementValue &&
+ "Expected copy/create callback to set replacement value!");
+ if (ReplacementValue == &V)
+ return;
+
+ for (Use *UPtr : Uses)
+ UPtr->set(ReplacementValue);
+
+ };
+
+ LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
+
+ InnerAllocaIP = IRBuilder<>::InsertPoint(
+ OMPIV->getParent(), OMPIV->getNextNode()->getIterator());
+
+ // Reset the outer alloca insertion point to the entry of the relevant block
+ // in case it was invalidated.
+ OuterAllocaIP = IRBuilder<>::InsertPoint(
+ OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
+
+ for (Value *Input : Inputs) {
+ PrivHelper(*Input);
+ }
+
+ assert(Outputs.empty() &&
+ "OpenMP outlining should not produce live-out values!");
+
+ LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
+ for (auto *BB : Blocks) {
+ LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n");
+ }
+
+ OI.PostOutlineCB = [=](Function &OutlinedFn) {
+
+ OutlinedFn.addFnAttr(Attribute::NoUnwind);
+ OutlinedFn.addFnAttr(Attribute::NoRecurse);
+
+ assert(OutlinedFn.arg_size() == 2 &&
+ "Expected omp.iv & structArg as arguments");
+
+ CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
+ BasicBlock *CallBlock = CI->getParent();
+ CallBlock->setName("omp_loop");
+ Builder.SetInsertPoint(CI);
+
+      Value *StructArg = CI->getArgOperand(1); // Operand 0 is omp.iv.
+
+ Value *SimdArgs[] = {
+ Ident,
+ Builder.CreateBitCast(&OutlinedFn, LoopTaskPtr),
+ DistVal,
+ Builder.CreateCast(Instruction::BitCast, StructArg, Int8PtrPtr)};
+
+ SmallVector<Value *, 16> RealArgs;
+ RealArgs.append(std::begin(SimdArgs), std::end(SimdArgs));
+
+ FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr(
+ (DistValType->isIntegerTy(32) ? OMPRTL___kmpc_simd_4u :
+ OMPRTL___kmpc_simd_8u));
+ Builder.CreateCall(RTLFn, RealArgs);
+
+ LLVM_DEBUG(dbgs() << "With kmpc_simd_4u call placed: " << *Builder.GetInsertBlock()->getParent() << "\n");
+
+ CI->eraseFromParent();
+
+ for (Instruction *I : ToBeDeleted)
+ I->eraseFromParent();
+
+ };
+
+ addOutlineInfo(std::move(OI));
+ }
+
+
+  // Outline 2: currently disabled; intended for the generic (non-SPMD)
+  // path, where the guard would become `if (!SPMD)`.
+  if (false) {
+ OutlineInfo OI;
+
+    OI.OuterAllocaBB = OuterAllocaBlock;
+    OI.EntryBB = EntryBB;
+    OI.ExitBB = FinalizeBB;
+
+ SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
+ SmallVector<BasicBlock *, 32> Blocks;
+ OI.collectBlocks(ParallelRegionBlockSet, Blocks);
+
+ CodeExtractorAnalysisCache CEAC(*OuterFn);
+
+ CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
+ /* AggregateArgs */ true,
+ /* BlockFrequencyInfo */ nullptr,
+ /* BranchProbabilityInfo */ nullptr,
+ /* AssumptionCache */ nullptr,
+ /* AllowVarArgs */ false,
+ /* AllowAlloca */ true,
+ /* AllocationBlock */ OuterAllocaBlock,
+ /* Suffix */ ".omp_simd");
+
+ BasicBlock *CommonExit = nullptr;
+ SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
+ Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
+ Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
+
+ auto PrivHelper = [&](Value &V) {
+ // Get all uses of value that are inside of the outlined region
+ SetVector<Use *> Uses;
+ for (Use &U : V.uses())
+ if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
+ if (ParallelRegionBlockSet.count(UserI->getParent()))
+ Uses.insert(&U);
+
+ Value *Inner = &V;
+
+ // If the value isn't a pointer type, store it in a pointer
+ // Unpack it inside the outlined region
+ if (!V.getType()->isPointerTy()) {
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
+
+ Builder.restoreIP(OuterAllocaIP);
+ Value *Ptr = Builder.CreateAlloca(
+ V.getType(), nullptr, V.getName() + ".reloaded");
+
+ // Store to stack at end of the block that currently branches to the entry
+ // block of the to-be-outlined region.
+ Builder.SetInsertPoint(
+ InsertBB, InsertBB->getTerminator()->getIterator());
+ Builder.CreateStore(&V, Ptr);
+
+ // Load back next to allocations in the to-be-outlined region.
+ Builder.restoreIP(InnerAllocaIP);
+ Inner = Builder.CreateLoad(V.getType(), Ptr);
+ }
+
+ Value *ReplacementValue = nullptr;
+ Builder.restoreIP(
+ PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
+ assert(ReplacementValue &&
+ "Expected copy/create callback to set replacement value!");
+ if (ReplacementValue == &V)
+ return;
+
+ for (Use *UPtr : Uses)
+ UPtr->set(ReplacementValue);
+
+ };
+
+ LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
+
+ InnerAllocaIP = IRBuilder<>::InsertPoint(
+ OMPIV->getParent(), OMPIV->getNextNode()->getIterator());
+
+ // Reset the outer alloca insertion point to the entry of the relevant block
+ // in case it was invalidated.
+ OuterAllocaIP = IRBuilder<>::InsertPoint(
+ OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
+
+ for (Value *Input : Inputs) {
+ PrivHelper(*Input);
+ }
+
+ assert(Outputs.empty() &&
+ "OpenMP outlining should not produce live-out values!");
+
+ LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
+ for (auto *BB : Blocks) {
+ LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n");
+ }
+
+ int NumInputs = Inputs.size();
+
+ OI.PostOutlineCB = [=](Function &OutlinedFn) {
+
+ OutlinedFn.addFnAttr(Attribute::NoUnwind);
+ OutlinedFn.addFnAttr(Attribute::NoRecurse);
+
+      assert(OutlinedFn.arg_size() == 1 &&
+             "Expected structArg as only argument");
+
+ CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
+ BasicBlock *CallBlock = CI->getParent();
+ CallBlock->setName("omp_simd");
+ Builder.SetInsertPoint(CI);
+
+      Value *StructArg = CI->getArgOperand(0);
+
+ Value *SimdArgs[] = {
+ Ident,
+ Builder.CreateBitCast(&OutlinedFn, SimdTaskPtr),
+ Builder.CreateCast(Instruction::BitCast, StructArg, Int8PtrPtr),
+ Builder.getInt32(NumInputs)};
+
+ SmallVector<Value *, 16> RealArgs;
+ RealArgs.append(std::begin(SimdArgs), std::end(SimdArgs));
+
+ FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr(
+ OMPRTL___kmpc_simd);
+ Builder.CreateCall(RTLFn, RealArgs);
+
+ LLVM_DEBUG(dbgs() << "With __kmpc_simd call placed: " << *Builder.GetInsertBlock()->getParent() << "\n");
+
+ CI->eraseFromParent();
+
+ for (Instruction *I : ToBeDeleted)
+ I->eraseFromParent();
+
+ };
+
+ addOutlineInfo(std::move(OI));
+ }
+
+  InsertPointTy AfterIP(FinalizeBB, FinalizeBB->end());
+  UI->eraseFromParent();
+
+  return AfterIP;
+}
+
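+// Schematically, for a 32-bit trip count the call emitted by the first
+// outlining step above has the shape (names illustrative):
+//
+//   call void @__kmpc_simd_4u(ptr @ident, ptr @body_fn.omp_simd,
+//                             i32 %trip_count, ptr %struct.arg)
+//
+// with __kmpc_simd_8u used instead for 64-bit trip counts.
+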
IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
@@ -1634,7 +2256,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
LLVM_DEBUG({
for (auto *BB : Blocks)
- dbgs() << " PBR: " << BB->getName() << "\n";
+ LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n");
});
// Register the outlined info.
@@ -2096,18 +2718,1463 @@ OpenMPIRBuilder::createSection(const LocationDescription &Loc,
/*IsCancellable*/ true);
}
-/// Create a function with a unique name and a "void (i8*, i8*)" signature in
-/// the given module and return it.
-Function *getFreshReductionFunc(Module &M) {
+static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
+ BasicBlock::iterator IT(I);
+ IT++;
+ return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
+}
+
+void OpenMPIRBuilder::emitUsed(StringRef Name,
+ std::vector<WeakTrackingVH> &List) {
+ if (List.empty())
+ return;
+
+ // Convert List to what ConstantArray needs.
+ SmallVector<Constant *, 8> UsedArray;
+ UsedArray.resize(List.size());
+ for (unsigned I = 0, E = List.size(); I != E; ++I)
+ UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ cast<Constant>(&*List[I]), Builder.getPtrTy());
+
+ if (UsedArray.empty())
+ return;
+ ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
+
+ auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+ ConstantArray::get(ATy, UsedArray), Name);
+
+ GV->setSection("llvm.metadata");
+}
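+
+// For example, emitUsed("llvm.compiler.used", {GV}) produces IR along the
+// lines of:
+//   @llvm.compiler.used = appending global [1 x ptr] [ptr @GV],
+//                         section "llvm.metadata"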
+
+Value *OpenMPIRBuilder::getGPUThreadID() {
+ return Builder.CreateCall(
+ getOrCreateRuntimeFunction(M,
+ OMPRTL___kmpc_get_hardware_thread_id_in_block),
+ {});
+}
+
+Value *OpenMPIRBuilder::getGPUWarpSize() {
+ return Builder.CreateCall(
+ getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
+}
+
+Value *OpenMPIRBuilder::getNVPTXWarpID() {
+ unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
+ return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
+}
+
+Value *OpenMPIRBuilder::getNVPTXLaneID() {
+ unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
+ assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
+ unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
+ return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
+ "nvptx_lane_id");
+}
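+
+// For the common warp size of 32, LaneIDBits is 5, so the lane id above is
+// tid & 0x1f and the warp id (getNVPTXWarpID) is tid >> 5.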
+
+Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
+ Type *ToType) {
+ Type *FromType = From->getType();
+ uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
+ uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
+ assert(FromSize > 0 && "From size must be greater than zero");
+ assert(ToSize > 0 && "To size must be greater than zero");
+ if (FromType == ToType)
+ return From;
+ if (FromSize == ToSize)
+ return Builder.CreateBitCast(From, ToType);
+ if (ToType->isIntegerTy() && FromType->isIntegerTy())
+ return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
+ InsertPointTy SaveIP = Builder.saveIP();
+ Builder.restoreIP(AllocaIP);
+ Value *CastItem = Builder.CreateAlloca(ToType);
+ Builder.restoreIP(SaveIP);
+
+ Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ CastItem, FromType->getPointerTo());
+ Builder.CreateStore(From, ValCastItem);
+ return Builder.CreateLoad(ToType, CastItem);
+}
+
+Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
+ Value *Element,
+ Type *ElementType,
+ Value *Offset) {
+ uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
+ assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
+
+ // Cast all types to 32- or 64-bit values before calling shuffle routines.
+ Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
+ Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
+ Value *WarpSize =
+ Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
+ Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
+ Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
+ : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
+ Value *WarpSizeCast =
+ Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
+ Value *ShuffleCall =
+ Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
+ return castValueToType(AllocaIP, ShuffleCall, CastTy);
+}
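+
+// For example, shuffling an 8-byte double from a lane `Offset` apart goes
+// through __kmpc_shuffle_int64 after a bitcast to i64, while a 2-byte short
+// is widened to i32 and uses __kmpc_shuffle_int32.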
+
+void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
+ Value *DstAddr, Type *ElemType,
+ Value *Offset, Type *ReductionArrayTy) {
+ uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
+ // Create the loop over the big sized data.
+ // ptr = (void*)Elem;
+ // ptrEnd = (void*) Elem + 1;
+ // Step = 8;
+ // while (ptr + Step < ptrEnd)
+ // shuffle((int64_t)*ptr);
+ // Step = 4;
+ // while (ptr + Step < ptrEnd)
+ // shuffle((int32_t)*ptr);
+ // ...
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ Value *ElemPtr = DstAddr;
+ Value *Ptr = SrcAddr;
+ for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
+ if (Size < IntSize)
+ continue;
+ Type *IntType = Builder.getIntNTy(IntSize * 8);
+ Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast");
+ Value *SrcAddrGEP =
+ Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
+ ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast");
+
+ Function *CurFunc = Builder.GetInsertBlock()->getParent();
+ if ((Size / IntSize) > 1) {
+ Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ SrcAddrGEP, Builder.getPtrTy());
+ BasicBlock *PreCondBB =
+ BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
+ BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
+ BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
+ BasicBlock *CurrentBB = Builder.GetInsertBlock();
+ emitBlock(PreCondBB, CurFunc);
+ PHINode *PhiSrc =
+ Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
+ PhiSrc->addIncoming(Ptr, CurrentBB);
+ PHINode *PhiDest =
+ Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
+ PhiDest->addIncoming(ElemPtr, CurrentBB);
+ Ptr = PhiSrc;
+ ElemPtr = PhiDest;
+ Value *PtrDiff = Builder.CreatePtrDiff(
+ Builder.getInt8Ty(), PtrEnd,
+ Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
+ Builder.CreateCondBr(
+ Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
+ ExitBB);
+ emitBlock(ThenBB, CurFunc);
+ Value *Res = createRuntimeShuffleFunction(
+ AllocaIP,
+ Builder.CreateAlignedLoad(
+ IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
+ IntType, Offset);
+ Builder.CreateAlignedStore(Res, ElemPtr,
+ M.getDataLayout().getPrefTypeAlign(ElemType));
+ Value *LocalPtr =
+ Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
+ Value *LocalElemPtr =
+ Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
+ PhiSrc->addIncoming(LocalPtr, ThenBB);
+ PhiDest->addIncoming(LocalElemPtr, ThenBB);
+ emitBranch(PreCondBB);
+ emitBlock(ExitBB, CurFunc);
+ } else {
+ Value *Res = createRuntimeShuffleFunction(
+ AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
+ if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
+ Res->getType()->getScalarSizeInBits())
+ Res = Builder.CreateTrunc(Res, ElemType);
+ Builder.CreateStore(Res, ElemPtr);
+ Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
+ ElemPtr =
+ Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
+ }
+ Size = Size % IntSize;
+ }
+}
+
+void OpenMPIRBuilder::emitReductionListCopy(
+ InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
+ ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
+ CopyOptionsTy CopyOptions) {
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
+
+  // Iterate, element by element, through the source Reduce list and
+  // make a copy.
+ for (auto En : enumerate(ReductionInfos)) {
+ const ReductionInfo &RI = En.value();
+ Value *SrcElementAddr = nullptr;
+ Value *DestElementAddr = nullptr;
+ Value *DestElementPtrAddr = nullptr;
+ // Should we shuffle in an element from a remote lane?
+ bool ShuffleInElement = false;
+ // Set to true to update the pointer in the dest Reduce list to a
+ // newly created element.
+ bool UpdateDestListPtr = false;
+
+ // Step 1.1: Get the address for the src element in the Reduce list.
+ Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
+ ReductionArrayTy, SrcBase,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
+
+ // Step 1.2: Create a temporary to store the element in the destination
+ // Reduce list.
+ DestElementPtrAddr = Builder.CreateInBoundsGEP(
+ ReductionArrayTy, DestBase,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ switch (Action) {
+ case CopyAction::RemoteLaneToThread: {
+ InsertPointTy CurIP = Builder.saveIP();
+ Builder.restoreIP(AllocaIP);
+ AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
+ ".omp.reduction.element");
+ DestAlloca->setAlignment(
+ M.getDataLayout().getPrefTypeAlign(RI.ElementType));
+ DestElementAddr = DestAlloca;
+ DestElementAddr =
+ Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
+ DestElementAddr->getName() + ".ascast");
+ Builder.restoreIP(CurIP);
+ ShuffleInElement = true;
+ UpdateDestListPtr = true;
+ break;
+ }
+ case CopyAction::ThreadCopy: {
+ DestElementAddr =
+ Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
+ break;
+ }
+ }
+
+ // Now that all active lanes have read the element in the
+ // Reduce list, shuffle over the value from the remote lane.
+ if (ShuffleInElement) {
+ shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
+ RemoteLaneOffset, ReductionArrayTy);
+ } else {
+ switch (RI.EvaluationKind) {
+ case EvaluationKindTy::Scalar: {
+ Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
+ // Store the source element value to the dest element address.
+ Builder.CreateStore(Elem, DestElementAddr);
+ break;
+ }
+ case EvaluationKindTy::Complex: {
+ Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, SrcElementAddr, 0, 0, ".realp");
+ Value *SrcReal = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
+ Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
+ Value *SrcImg = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
+
+ Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, DestElementAddr, 0, 0, ".realp");
+ Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, DestElementAddr, 0, 1, ".imagp");
+ Builder.CreateStore(SrcReal, DestRealPtr);
+ Builder.CreateStore(SrcImg, DestImgPtr);
+ break;
+ }
+ case EvaluationKindTy::Aggregate: {
+ Value *SizeVal = Builder.getInt64(
+ M.getDataLayout().getTypeStoreSize(RI.ElementType));
+ Builder.CreateMemCpy(
+ DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+ SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+ SizeVal, false);
+ break;
+ }
+      }
+ }
+
+ // Step 3.1: Modify reference in dest Reduce list as needed.
+ // Modifying the reference in Reduce list to point to the newly
+ // created element. The element is live in the current function
+ // scope and that of functions it invokes (i.e., reduce_function).
+ // RemoteReduceData[i] = (void*)&RemoteElem
+ if (UpdateDestListPtr) {
+ Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ DestElementAddr, Builder.getPtrTy(),
+ DestElementAddr->getName() + ".ascast");
+ Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
+ }
+ }
+}
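+
+// RemoteLaneToThread is the mode used below by the shuffle-and-reduce
+// function to materialize a remote lane's reduce list on the local stack;
+// ThreadCopy is a plain element-wise copy between two reduce lists.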
+
+Function *OpenMPIRBuilder::emitInterWarpCopyFunction(
+ const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
+ AttributeList FuncAttrs) {
+ InsertPointTy SavedIP = Builder.saveIP();
+ LLVMContext &Ctx = M.getContext();
+ FunctionType *FuncTy = FunctionType::get(
+ Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
+ /* IsVarArg */ false);
+ Function *WcFunc =
+ Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+ "_omp_reduction_inter_warp_copy_func", &M);
+ WcFunc->setAttributes(FuncAttrs);
+ WcFunc->addParamAttr(0, Attribute::NoUndef);
+ WcFunc->addParamAttr(1, Attribute::NoUndef);
+ BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
+ Builder.SetInsertPoint(EntryBB);
+
+ // ReduceList: thread local Reduce list.
+ // At the stage of the computation when this function is called, partially
+ // aggregated values reside in the first lane of every active warp.
+ Argument *ReduceListArg = WcFunc->getArg(0);
+ // NumWarps: number of warps active in the parallel region. This could
+ // be smaller than 32 (max warps in a CTA) for partial block reduction.
+ Argument *NumWarpsArg = WcFunc->getArg(1);
+
+ // This array is used as a medium to transfer, one reduce element at a time,
+ // the data from the first lane of every warp to lanes in the first warp
+ // in order to perform the final step of a reduction in a parallel region
+ // (reduction across warps). The array is placed in NVPTX __shared__ memory
+ // for reduced latency, as well as to have a distinct copy for concurrently
+ // executing target regions. The array is declared with common linkage so
+ // as to be shared across compilation units.
+ StringRef TransferMediumName =
+ "__openmp_nvptx_data_transfer_temporary_storage";
+ GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
+ unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
+ ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
+ if (!TransferMedium) {
+ TransferMedium = new GlobalVariable(
+ M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
+ UndefValue::get(ArrayTy), TransferMediumName,
+ /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
+ /*AddressSpace=*/3);
+ }
+
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+
+ // Get the CUDA thread id of the current OpenMP thread on the GPU.
+ Value *GPUThreadID = getGPUThreadID();
+ // nvptx_lane_id = nvptx_id % warpsize
+ Value *LaneID = getNVPTXLaneID();
+ // nvptx_warp_id = nvptx_id / warpsize
+ Value *WarpID = getNVPTXWarpID();
+
+ InsertPointTy AllocaIP =
+ InsertPointTy(Builder.GetInsertBlock(),
+ Builder.GetInsertBlock()->getFirstInsertionPt());
+ Type *Arg0Type = ReduceListArg->getType();
+ Type *Arg1Type = NumWarpsArg->getType();
+ Builder.restoreIP(AllocaIP);
+ AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
+ Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
+ AllocaInst *NumWarpsAlloca =
+ Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
+ Value *ThreadID =
+ getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
+ Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
+ Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ NumWarpsAlloca, Arg1Type->getPointerTo(),
+ NumWarpsAlloca->getName() + ".ascast");
+ Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
+ Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
+ AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
+ InsertPointTy CodeGenIP =
+ getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
+ Builder.restoreIP(CodeGenIP);
+
+ Value *ReduceList =
+ Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
+
+ for (auto En : enumerate(ReductionInfos)) {
+ //
+ // Warp master copies reduce element to transfer medium in __shared__
+ // memory.
+ //
+ const ReductionInfo &RI = En.value();
+ unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
+ for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
+ Type *CType = Builder.getIntNTy(TySize * 8);
+
+ unsigned NumIters = RealTySize / TySize;
+ if (NumIters == 0)
+ continue;
+ Value *Cnt = nullptr;
+ Value *CntAddr = nullptr;
+ BasicBlock *PrecondBB = nullptr;
+ BasicBlock *ExitBB = nullptr;
+ if (NumIters > 1) {
+ CodeGenIP = Builder.saveIP();
+ Builder.restoreIP(AllocaIP);
+ CntAddr =
+ Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
+
+ CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
+ CntAddr->getName() + ".ascast");
+ Builder.restoreIP(CodeGenIP);
+ Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
+ CntAddr,
+ /*Volatile=*/false);
+ PrecondBB = BasicBlock::Create(Ctx, "precond");
+ ExitBB = BasicBlock::Create(Ctx, "exit");
+ BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
+ emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
+ Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
+ /*Volatile=*/false);
+ Value *Cmp = Builder.CreateICmpULT(
+ Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
+ Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
+ emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
+ }
+
+ // kmpc_barrier.
+ createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+ omp::Directive::OMPD_unknown,
+ /* ForceSimpleCall */ false,
+ /* CheckCancelFlag */ true, ThreadID);
+ BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
+ BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
+ BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
+
+ // if (lane_id == 0)
+ Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
+ Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
+ emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
+
+ // Reduce element = LocalReduceList[i]
+ auto *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ Value *ElemPtrPtr =
+ Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
+ {ConstantInt::get(IndexTy, 0),
+ ConstantInt::get(IndexTy, En.index())});
+ // elemptr = ((CopyType*)(elemptrptr)) + I
+ Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+ if (NumIters > 1)
+ ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
+
+ // Get pointer to location in transfer medium.
+ // MediumPtr = &medium[warp_id]
+ Value *MediumPtr = Builder.CreateInBoundsGEP(
+ ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
+ // elem = *elemptr
+ //*MediumPtr = elem
+ Value *Elem = Builder.CreateLoad(CType, ElemPtr);
+ // Store the source element value to the dest element address.
+ Builder.CreateStore(Elem, MediumPtr,
+ /*IsVolatile*/ true);
+ Builder.CreateBr(MergeBB);
+
+ // else
+ emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
+ Builder.CreateBr(MergeBB);
+
+ // endif
+ emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
+ createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+ omp::Directive::OMPD_unknown,
+ /* ForceSimpleCall */ false,
+ /* CheckCancelFlag */ true, ThreadID);
+
+ // Warp 0 copies reduce element from transfer medium
+ BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
+ BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
+ BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
+
+ Value *NumWarpsVal =
+ Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
+ // Up to 32 threads in warp 0 are active.
+ Value *IsActiveThread =
+ Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
+ Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
+
+ emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
+
+ // SecMediumPtr = &medium[tid]
+ // SrcMediumVal = *SrcMediumPtr
+ Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
+ ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
+ // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
+ Value *TargetElemPtrPtr =
+ Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
+ {ConstantInt::get(IndexTy, 0),
+ ConstantInt::get(IndexTy, En.index())});
+ Value *TargetElemPtrVal =
+ Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
+ Value *TargetElemPtr = TargetElemPtrVal;
+ if (NumIters > 1)
+ TargetElemPtr =
+ Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
+
+ // *TargetElemPtr = SrcMediumVal;
+ Value *SrcMediumValue =
+ Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
+ Builder.CreateStore(SrcMediumValue, TargetElemPtr);
+ Builder.CreateBr(W0MergeBB);
+
+ emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
+ Builder.CreateBr(W0MergeBB);
+
+ emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
+
+ if (NumIters > 1) {
+ Cnt = Builder.CreateNSWAdd(
+ Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
+ Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
+
+ auto *CurFn = Builder.GetInsertBlock()->getParent();
+ emitBranch(PrecondBB);
+ emitBlock(ExitBB, CurFn);
+ }
+ RealTySize %= TySize;
+ }
+ }
+
+ Builder.CreateRetVoid();
+ Builder.restoreIP(SavedIP);
+
+ return WcFunc;
+}
+
+Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
+ ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+ AttributeList FuncAttrs) {
+ LLVMContext &Ctx = M.getContext();
+ FunctionType *FuncTy =
+ FunctionType::get(Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getInt16Ty(),
+ Builder.getInt16Ty(), Builder.getInt16Ty()},
+ /* IsVarArg */ false);
+ Function *SarFunc =
+ Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+ "_omp_reduction_shuffle_and_reduce_func", &M);
+ SarFunc->setAttributes(FuncAttrs);
+ SarFunc->addParamAttr(0, Attribute::NoUndef);
+ SarFunc->addParamAttr(1, Attribute::NoUndef);
+ SarFunc->addParamAttr(2, Attribute::NoUndef);
+ SarFunc->addParamAttr(3, Attribute::NoUndef);
+ SarFunc->addParamAttr(1, Attribute::SExt);
+ SarFunc->addParamAttr(2, Attribute::SExt);
+ SarFunc->addParamAttr(3, Attribute::SExt);
+ BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
+ Builder.SetInsertPoint(EntryBB);
+
+ // Thread local Reduce list used to host the values of data to be reduced.
+ Argument *ReduceListArg = SarFunc->getArg(0);
+ // Current lane id; could be logical.
+ Argument *LaneIDArg = SarFunc->getArg(1);
+ // Offset of the remote source lane relative to the current lane.
+ Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
+ // Algorithm version. This is expected to be known at compile time.
+ Argument *AlgoVerArg = SarFunc->getArg(3);
+
+ Type *ReduceListArgType = ReduceListArg->getType();
+ Type *LaneIDArgType = LaneIDArg->getType();
+ Type *LaneIDArgPtrType = LaneIDArg->getType()->getPointerTo();
+ Value *ReduceListAlloca = Builder.CreateAlloca(
+ ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
+ Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
+ LaneIDArg->getName() + ".addr");
+ Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
+ LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
+ Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
+ AlgoVerArg->getName() + ".addr");
+ ArrayType *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+ // Create a local thread-private variable to host the Reduce list
+ // from a remote lane.
+ Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
+ RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
+
+ Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReduceListAlloca, ReduceListArgType,
+ ReduceListAlloca->getName() + ".ascast");
+ Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
+ Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RemoteLaneOffsetAlloca, LaneIDArgPtrType,
+ RemoteLaneOffsetAlloca->getName() + ".ascast");
+ Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
+ Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RemoteReductionListAlloca, Builder.getPtrTy(),
+ RemoteReductionListAlloca->getName() + ".ascast");
+
+ Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
+ Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
+ Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
+ Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
+
+ Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
+ Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
+ Value *RemoteLaneOffset =
+ Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
+ Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
+
+ InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
+
+ // This loop iterates through the list of reduce elements and copies,
+ // element by element, from a remote lane in the warp to RemoteReduceList,
+ // hosted on the thread's stack.
+ emitReductionListCopy(
+ AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
+ ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
+
+ // The actions to be performed on the Remote Reduce list depend on the
+ // algorithm version.
+ //
+ // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
+ // LaneId % 2 == 0 && Offset > 0):
+ // do the reduction value aggregation
+ //
+ // The thread local variable Reduce list is mutated in place to host the
+ // reduced data, which is the aggregated value produced from local and
+ // remote lanes.
+ //
+ // Note that AlgoVer is expected to be a constant integer known at compile
+ // time.
+ // When AlgoVer==0, the first conjunction evaluates to true, making
+ // the entire predicate true at compile time.
+ // When AlgoVer==1, only the second part of the second conjunction must
+ // be evaluated at runtime; the other conjunctions fold to false at
+ // compile time.
+ // When AlgoVer==2, only the second part of the third conjunction must
+ // be evaluated at runtime; the other conjunctions fold to false at
+ // compile time.
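+ // For example, with AlgoVer==1 on a 32-lane warp the runtime typically
+ // passes halving offsets (16, 8, 4, 2, 1): at Offset==16 lanes 0-15 fold
+ // in the value shuffled over from lanes 16-31, and so on until lane 0
+ // holds the fully reduced value.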
+ Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
+ Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+ Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
+ Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
+ Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
+ Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
+ Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
+ Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
+ Value *RemoteOffsetComp =
+ Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
+ Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
+ Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
+ Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
+
+ BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
+ BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
+ BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
+
+ Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
+ emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
+ // reduce_function(LocalReduceList, RemoteReduceList)
+ Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReduceList, Builder.getPtrTy());
+ Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RemoteListAddrCast, Builder.getPtrTy());
+ Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
+ ->addFnAttr(Attribute::NoUnwind);
+ Builder.CreateBr(MergeBB);
+
+ emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
+ Builder.CreateBr(MergeBB);
+
+ emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
+
+ // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
+ // Reduce list.
+ Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+ Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
+ Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
+
+ BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
+ BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
+ BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
+ Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
+
+ emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
+ emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
+ ReductionInfos, RemoteListAddrCast, ReduceList);
+ Builder.CreateBr(CpyMergeBB);
+
+ emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
+ Builder.CreateBr(CpyMergeBB);
+
+ emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
+
+ Builder.CreateRetVoid();
+
+ return SarFunc;
+}
+
+Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
+ ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
+ AttributeList FuncAttrs) {
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+ LLVMContext &Ctx = M.getContext();
+ FunctionType *FuncTy = FunctionType::get(
+ Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+ /* IsVarArg */ false);
+ Function *LtGCFunc =
+ Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+ "_omp_reduction_list_to_global_copy_func", &M);
+ LtGCFunc->setAttributes(FuncAttrs);
+ LtGCFunc->addParamAttr(0, Attribute::NoUndef);
+ LtGCFunc->addParamAttr(1, Attribute::NoUndef);
+ LtGCFunc->addParamAttr(2, Attribute::NoUndef);
+
+ BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
+ Builder.SetInsertPoint(EntryBlock);
+
+ // Buffer: global reduction buffer.
+ Argument *BufferArg = LtGCFunc->getArg(0);
+ // Idx: index of the buffer.
+ Argument *IdxArg = LtGCFunc->getArg(1);
+ // ReduceList: thread local Reduce list.
+ Argument *ReduceListArg = LtGCFunc->getArg(2);
+
+ Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+ BufferArg->getName() + ".addr");
+ Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+ IdxArg->getName() + ".addr");
+ Value *ReduceListArgAlloca = Builder.CreateAlloca(
+ Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+ Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ BufferArgAlloca, Builder.getPtrTy(),
+ BufferArgAlloca->getName() + ".ascast");
+ Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
+ Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReduceListArgAlloca, Builder.getPtrTy(),
+ ReduceListArgAlloca->getName() + ".ascast");
+
+ Builder.CreateStore(BufferArg, BufferArgAddrCast);
+ Builder.CreateStore(IdxArg, IdxArgAddrCast);
+ Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+ Value *LocalReduceList =
+ Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+ Value *BufferArgVal =
+ Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+ Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ for (auto En : enumerate(ReductionInfos)) {
+ const ReductionInfo &RI = En.value();
+ auto *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+ // Reduce element = LocalReduceList[i]
+ Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
+ RedListArrayTy, LocalReduceList,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ // elemptr = ((CopyType*)(elemptrptr)) + I
+ Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+
+ // Global = Buffer.VD[Idx];
+ Value *BufferVD =
+ Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
+ Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
+ ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+
+ switch (RI.EvaluationKind) {
+ case EvaluationKindTy::Scalar: {
+ Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
+ Builder.CreateStore(TargetElement, GlobVal);
+ break;
+ }
+ case EvaluationKindTy::Complex: {
+ Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, ElemPtr, 0, 0, ".realp");
+ Value *SrcReal = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
+ Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, ElemPtr, 0, 1, ".imagp");
+ Value *SrcImg = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
+
+ Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, GlobVal, 0, 0, ".realp");
+ Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, GlobVal, 0, 1, ".imagp");
+ Builder.CreateStore(SrcReal, DestRealPtr);
+ Builder.CreateStore(SrcImg, DestImgPtr);
+ break;
+ }
+ case EvaluationKindTy::Aggregate: {
+ Value *SizeVal =
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
+ Builder.CreateMemCpy(
+ GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
+ M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
+ break;
+ }
+ }
+ }
+
+ Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
+ return LtGCFunc;
+}
+
+Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
+ ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+ Type *ReductionsBufferTy, AttributeList FuncAttrs) {
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+ LLVMContext &Ctx = M.getContext();
+ FunctionType *FuncTy = FunctionType::get(
+ Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+ /* IsVarArg */ false);
+ Function *LtGRFunc =
+ Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+ "_omp_reduction_list_to_global_reduce_func", &M);
+ LtGRFunc->setAttributes(FuncAttrs);
+ LtGRFunc->addParamAttr(0, Attribute::NoUndef);
+ LtGRFunc->addParamAttr(1, Attribute::NoUndef);
+ LtGRFunc->addParamAttr(2, Attribute::NoUndef);
+
+ BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
+ Builder.SetInsertPoint(EntryBlock);
+
+ // Buffer: global reduction buffer.
+ Argument *BufferArg = LtGRFunc->getArg(0);
+ // Idx: index of the buffer.
+ Argument *IdxArg = LtGRFunc->getArg(1);
+ // ReduceList: thread local Reduce list.
+ Argument *ReduceListArg = LtGRFunc->getArg(2);
+
+ Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+ BufferArg->getName() + ".addr");
+ Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+ IdxArg->getName() + ".addr");
+ Value *ReduceListArgAlloca = Builder.CreateAlloca(
+ Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+ auto *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+ // 1. Build a list of reduction variables.
+ // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+ Value *LocalReduceList =
+ Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
+
+ Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ BufferArgAlloca, Builder.getPtrTy(),
+ BufferArgAlloca->getName() + ".ascast");
+ Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
+ Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReduceListArgAlloca, Builder.getPtrTy(),
+ ReduceListArgAlloca->getName() + ".ascast");
+ Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LocalReduceList, Builder.getPtrTy(),
+ LocalReduceList->getName() + ".ascast");
+
+ Builder.CreateStore(BufferArg, BufferArgAddrCast);
+ Builder.CreateStore(IdxArg, IdxArgAddrCast);
+ Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+ Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+ Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ for (auto En : enumerate(ReductionInfos)) {
+ Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
+ RedListArrayTy, LocalReduceListAddrCast,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ Value *BufferVD =
+ Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
+ // Global = Buffer.VD[Idx];
+ Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+ ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+ Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
+ }
+
+ // Call reduce_function(GlobalReduceList, ReduceList)
+ Value *ReduceList =
+ Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+ Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
+ ->addFnAttr(Attribute::NoUnwind);
+ Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
+ return LtGRFunc;
+}
+
+Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
+ ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
+ AttributeList FuncAttrs) {
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+ LLVMContext &Ctx = M.getContext();
+ FunctionType *FuncTy = FunctionType::get(
+ Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+ /* IsVarArg */ false);
+ Function *LtGCFunc =
+ Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+ "_omp_reduction_global_to_list_copy_func", &M);
+ LtGCFunc->setAttributes(FuncAttrs);
+ LtGCFunc->addParamAttr(0, Attribute::NoUndef);
+ LtGCFunc->addParamAttr(1, Attribute::NoUndef);
+ LtGCFunc->addParamAttr(2, Attribute::NoUndef);
+
+ BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
+ Builder.SetInsertPoint(EntryBlock);
+
+ // Buffer: global reduction buffer.
+ Argument *BufferArg = LtGCFunc->getArg(0);
+ // Idx: index of the buffer.
+ Argument *IdxArg = LtGCFunc->getArg(1);
+ // ReduceList: thread local Reduce list.
+ Argument *ReduceListArg = LtGCFunc->getArg(2);
+
+ Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+ BufferArg->getName() + ".addr");
+ Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+ IdxArg->getName() + ".addr");
+ Value *ReduceListArgAlloca = Builder.CreateAlloca(
+ Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+ Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ BufferArgAlloca, Builder.getPtrTy(),
+ BufferArgAlloca->getName() + ".ascast");
+ Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
+ Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReduceListArgAlloca, Builder.getPtrTy(),
+ ReduceListArgAlloca->getName() + ".ascast");
+ Builder.CreateStore(BufferArg, BufferArgAddrCast);
+ Builder.CreateStore(IdxArg, IdxArgAddrCast);
+ Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+ Value *LocalReduceList =
+ Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+ Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+ Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ for (auto En : enumerate(ReductionInfos)) {
+ const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+ auto *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+ // Reduce element = LocalReduceList[i]
+ Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
+ RedListArrayTy, LocalReduceList,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ // elemptr = ((CopyType*)(elemptrptr)) + I
+ Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+ // Global = Buffer.VD[Idx];
+ Value *BufferVD =
+ Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
+ Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+ ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+
+ switch (RI.EvaluationKind) {
+ case EvaluationKindTy::Scalar: {
+ Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
+ Builder.CreateStore(TargetElement, ElemPtr);
+ break;
+ }
+ case EvaluationKindTy::Complex: {
+ Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, GlobValPtr, 0, 0, ".realp");
+ Value *SrcReal = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
+ Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, GlobValPtr, 0, 1, ".imagp");
+ Value *SrcImg = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
+
+ Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, ElemPtr, 0, 0, ".realp");
+ Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, ElemPtr, 0, 1, ".imagp");
+ Builder.CreateStore(SrcReal, DestRealPtr);
+ Builder.CreateStore(SrcImg, DestImgPtr);
+ break;
+ }
+ case EvaluationKindTy::Aggregate: {
+ Value *SizeVal =
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
+ Builder.CreateMemCpy(
+ ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+ GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+ SizeVal, false);
+ break;
+ }
+ }
+ }
+
+ Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
+ return LtGCFunc;
+}
+
+Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
+ ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+ Type *ReductionsBufferTy, AttributeList FuncAttrs) {
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+ LLVMContext &Ctx = M.getContext();
+ auto *FuncTy = FunctionType::get(
+ Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+ /* IsVarArg */ false);
+ Function *LtGRFunc =
+ Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+ "_omp_reduction_global_to_list_reduce_func", &M);
+ LtGRFunc->setAttributes(FuncAttrs);
+ LtGRFunc->addParamAttr(0, Attribute::NoUndef);
+ LtGRFunc->addParamAttr(1, Attribute::NoUndef);
+ LtGRFunc->addParamAttr(2, Attribute::NoUndef);
+
+ BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
+ Builder.SetInsertPoint(EntryBlock);
+
+ // Buffer: global reduction buffer.
+ Argument *BufferArg = LtGRFunc->getArg(0);
+ // Idx: index of the buffer.
+ Argument *IdxArg = LtGRFunc->getArg(1);
+ // ReduceList: thread local Reduce list.
+ Argument *ReduceListArg = LtGRFunc->getArg(2);
+
+ Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+ BufferArg->getName() + ".addr");
+ Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+ IdxArg->getName() + ".addr");
+ Value *ReduceListArgAlloca = Builder.CreateAlloca(
+ Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+ ArrayType *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+ // 1. Build a list of reduction variables.
+ // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+ Value *LocalReduceList =
+ Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
+
+ Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ BufferArgAlloca, Builder.getPtrTy(),
+ BufferArgAlloca->getName() + ".ascast");
+ Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
+ Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReduceListArgAlloca, Builder.getPtrTy(),
+ ReduceListArgAlloca->getName() + ".ascast");
+ Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LocalReduceList, Builder.getPtrTy(),
+ LocalReduceList->getName() + ".ascast");
+
+ Builder.CreateStore(BufferArg, BufferArgAddrCast);
+ Builder.CreateStore(IdxArg, IdxArgAddrCast);
+ Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+ Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+ Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ for (auto En : enumerate(ReductionInfos)) {
+ Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
+ RedListArrayTy, ReductionList,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ // Global = Buffer.VD[Idx];
+ Value *BufferVD =
+ Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
+ Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+ ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+ Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
+ }
+
+ // Call reduce_function(ReduceList, GlobalReduceList)
+ Value *ReduceList =
+ Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+ Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
+ ->addFnAttr(Attribute::NoUnwind);
+ Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
+ return LtGRFunc;
+}
+
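+// Returns Name with a platform-specific reduction-function suffix appended,
+// e.g. something like "<Name>.omp.reduction.reduction_func" on the host;
+// the exact separators come from the OpenMPIRBuilderConfig.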
+std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
+ std::string Suffix =
+ createPlatformSpecificName({"omp", "reduction", "reduction_func"});
+ return (Name + Suffix).str();
+}
+
+Function *OpenMPIRBuilder::createReductionFunction(
+ StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
+ ReductionGenCBTy ReductionGenCBTy, AttributeList FuncAttrs) {
+ auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getPtrTy()},
+ /* IsVarArg */ false);
+ std::string Name = getReductionFuncName(ReducerName);
+ Function *ReductionFunc =
+ Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
+ ReductionFunc->setAttributes(FuncAttrs);
+ ReductionFunc->addParamAttr(0, Attribute::NoUndef);
+ ReductionFunc->addParamAttr(1, Attribute::NoUndef);
+ BasicBlock *EntryBB =
+ BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
+ Builder.SetInsertPoint(EntryBB);
+
+ // Need to alloca memory here and deal with the pointers before getting
+ // LHS/RHS pointers out
+ Value *LHSArrayPtr = nullptr;
+ Value *RHSArrayPtr = nullptr;
+ Argument *Arg0 = ReductionFunc->getArg(0);
+ Argument *Arg1 = ReductionFunc->getArg(1);
+ Type *Arg0Type = Arg0->getType();
+ Type *Arg1Type = Arg1->getType();
+
+ Value *LHSAlloca =
+ Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+ Value *RHSAlloca =
+ Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+ Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
+ Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
+ Builder.CreateStore(Arg0, LHSAddrCast);
+ Builder.CreateStore(Arg1, RHSAddrCast);
+ LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
+ RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
+
+ Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ SmallVector<Value *> LHSPtrs, RHSPtrs;
+ for (auto En : enumerate(ReductionInfos)) {
+ const ReductionInfo &RI = En.value();
+ Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
+ RedArrayTy, RHSArrayPtr,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
+ Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RHSI8Ptr, RI.PrivateVariable->getType(),
+ RHSI8Ptr->getName() + ".ascast");
+
+ Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
+ RedArrayTy, LHSArrayPtr,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+ Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
+
+ if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
+ LHSPtrs.emplace_back(LHSPtr);
+ RHSPtrs.emplace_back(RHSPtr);
+ } else {
+ Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+ Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+ Value *Reduced;
+ RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
+ if (!Builder.GetInsertBlock())
+ return ReductionFunc;
+ Builder.CreateStore(Reduced, LHSPtr);
+ }
+ }
+
+ if (ReductionGenCBTy == ReductionGenCBTy::Clang)
+ for (auto En : enumerate(ReductionInfos)) {
+ unsigned Index = En.index();
+ const ReductionInfo &RI = En.value();
+ Value *LHSFixupPtr, *RHSFixupPtr;
+ Builder.restoreIP(RI.ReductionGenClang(
+ Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
+
+ // Fix the callback code generated to use the correct Values for the
+ // LHS and RHS
+ LHSFixupPtr->replaceUsesWithIf(
+ LHSPtrs[Index], [ReductionFunc](const Use &U) {
+ return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+ ReductionFunc;
+ });
+ RHSFixupPtr->replaceUsesWithIf(
+ RHSPtrs[Index], [ReductionFunc](const Use &U) {
+ return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+ ReductionFunc;
+ });
+ }
+
+ Builder.CreateRetVoid();
+ return ReductionFunc;
+}
+
+static void
+checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+ bool IsGPU) {
+ for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
+ (void)RI;
+ assert(RI.Variable && "expected non-null variable");
+ assert(RI.PrivateVariable && "expected non-null private variable");
+ assert((RI.ReductionGen || RI.ReductionGenClang) &&
+ "expected non-null reduction generator callback");
+ if (!IsGPU) {
+ assert(
+ RI.Variable->getType() == RI.PrivateVariable->getType() &&
+ "expected variables and their private equivalents to have the same "
+ "type");
+ }
+ assert(RI.Variable->getType()->isPointerTy() &&
+ "expected variables to be pointers");
+ }
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
+ bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
+ bool IsSimdReduction, ReductionGenCBTy ReductionGenCBTy,
+ std::optional<omp::GV> GridValue,
+ unsigned ReductionBufNum, Value *SrcLocInfo) {
+
+ if (!updateToLocation(Loc))
+ return InsertPointTy();
+ Builder.restoreIP(CodeGenIP);
+ checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
+ LLVMContext &Ctx = M.getContext();
+
+ // Source location for the ident struct
+ if (!SrcLocInfo) {
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+ }
+
+ if (ReductionInfos.empty())
+ return Builder.saveIP();
+
+ Function *CurFunc = Builder.GetInsertBlock()->getParent();
+ AttributeList FuncAttrs;
+ AttrBuilder AttrBldr(Ctx);
+ for (auto Attr : CurFunc->getAttributes().getFnAttrs())
+ AttrBldr.addAttribute(Attr);
+ AttrBldr.removeAttribute(Attribute::OptimizeNone);
+ FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
+
+ Function *ReductionFunc = nullptr;
+ if (GLOBAL_ReductionFunc) {
+ ReductionFunc = GLOBAL_ReductionFunc;
+ } else {
+ CodeGenIP = Builder.saveIP();
+ ReductionFunc = createReductionFunction(
+ Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos,
+ ReductionGenCBTy, FuncAttrs);
+ Builder.restoreIP(CodeGenIP);
+ }
+
+ // Set the grid value in the config needed for lowering later on
+ if (GridValue.has_value())
+ Config.setGridValue(GridValue.value());
+ else
+ Config.setGridValue(getGridValue(T, ReductionFunc));
+
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
+ Value *RTLoc =
+ getOrCreateIdent(SrcLocStr, SrcLocStrSize, omp::IdentFlag(0), 0);
+
+ // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
+ // RedList, shuffle_reduce_func, interwarp_copy_func);
+ // or
+ // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
+ Value *Res;
+
+ // 1. Build a list of reduction variables.
+ // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+ auto Size = ReductionInfos.size();
+ Type *PtrTy = PointerType::getUnqual(Ctx);
+ Type *RedArrayTy = ArrayType::get(PtrTy, Size);
+ CodeGenIP = Builder.saveIP();
+ Builder.restoreIP(AllocaIP);
+ Value *ReductionListAlloca =
+ Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
+ Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
+ Builder.restoreIP(CodeGenIP);
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ for (auto En : enumerate(ReductionInfos)) {
+ const ReductionInfo &RI = En.value();
+ Value *ElemPtr = Builder.CreateInBoundsGEP(
+ RedArrayTy, ReductionList,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ Value *CastElem =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
+ Builder.CreateStore(CastElem, ElemPtr);
+ }
+ CodeGenIP = Builder.saveIP();
+ Function *SarFunc =
+ emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
+ Function *WcFunc = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
+ Builder.restoreIP(CodeGenIP);
+
+ Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
+
+ unsigned MaxDataSize = 0;
+ SmallVector<Type *> ReductionTypeArgs;
+ for (auto En : enumerate(ReductionInfos)) {
+ auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
+ if (Size > MaxDataSize)
+ MaxDataSize = Size;
+ ReductionTypeArgs.emplace_back(En.value().ElementType);
+ }
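+ // ReductionDataSize is a conservative per-thread payload size: the store
+ // size of the widest element times the number of reductions, e.g.
+ // 8 * 2 == 16 bytes for reductions over {double, int32_t}.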
+ Value *ReductionDataSize =
+ Builder.getInt64(MaxDataSize * ReductionInfos.size());
+ if (IsSimdReduction) {
+ Value *SarFuncCast =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
+ Value *WcFuncCast =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
+ Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
+ Function *SimdReduceFn = getOrCreateRuntimeFunctionPtr(
+ RuntimeFunction::OMPRTL___kmpc_nvptx_simd_reduce_nowait_v2);
+ Res = Builder.CreateCall(SimdReduceFn, Args);
+ } else if (!IsTeamsReduction) {
+ Value *SarFuncCast =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
+ Value *WcFuncCast =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
+ Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
+ Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
+ RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
+ Res = Builder.CreateCall(Pv2Ptr, Args);
+ } else {
+ CodeGenIP = Builder.saveIP();
+ StructType *ReductionsBufferTy = StructType::create(
+ Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
+ Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
+ RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
+ Function *LtGCFunc = emitListToGlobalCopyFunction(
+ ReductionInfos, ReductionsBufferTy, FuncAttrs);
+ Function *LtGRFunc = emitListToGlobalReduceFunction(
+ ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
+ Function *GtLCFunc = emitGlobalToListCopyFunction(
+ ReductionInfos, ReductionsBufferTy, FuncAttrs);
+ Function *GtLRFunc = emitGlobalToListReduceFunction(
+ ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
+ Builder.restoreIP(CodeGenIP);
+
+ Value *KernelTeamsReductionPtr = Builder.CreateCall(
+ RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
+
+ Value *Args3[] = {RTLoc,
+ KernelTeamsReductionPtr,
+ Builder.getInt32(ReductionBufNum),
+ ReductionDataSize,
+ RL,
+ SarFunc,
+ WcFunc,
+ LtGCFunc,
+ LtGRFunc,
+ GtLCFunc,
+ GtLRFunc};
+
+ Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
+ RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
+ Res = Builder.CreateCall(TeamsReduceFn, Args3);
+ }
+
+ // 5. Build if (res == 1)
+ BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
+ BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
+ Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
+ Builder.CreateCondBr(Cond, ThenBB, ExitBB);
+
+ // 6. Build then branch: where we have reduced values in the master
+ // thread in each team.
+ // __kmpc_end_reduce{_nowait}(<gtid>);
+ // break;
+ emitBlock(ThenBB, CurFunc);
+
+ // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
+ for (auto En : enumerate(ReductionInfos)) {
+ const ReductionInfo &RI = En.value();
+ Value *LHS = RI.Variable;
+ Value *RHS =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
+
+ if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
+ Value *LHSPtr, *RHSPtr;
+ Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
+ &LHSPtr, &RHSPtr, CurFunc));
+
+ // Fix the callback code generated to use the correct Values for the
+ // LHS and RHS
+ LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
+ return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+ ReductionFunc;
+ });
+ RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
+ return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+ ReductionFunc;
+ });
+ } else {
+ // TODO: the non-Clang callback path should emit the final combine via
+ // RI.ReductionGen here; it is currently left unimplemented.
+ }
+ }
+ emitBlock(ExitBB, CurFunc);
+
+ Config.setEmitLLVMUsed();
+
+ return Builder.saveIP();
+}
+
+static Function *getFreshReductionFunc(Module &M) {
Type *VoidTy = Type::getVoidTy(M.getContext());
Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
auto *FuncTy =
FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
- M.getDataLayout().getDefaultGlobalsAddressSpace(),
".omp.reduction.func", &M);
}
+
+static void populateReductionFunction(
+ Function *ReductionFunc,
+ ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+ IRBuilder<> &Builder) {
+ Module *Module = ReductionFunc->getParent();
+ BasicBlock *ReductionFuncBlock =
+ BasicBlock::Create(Module->getContext(), "", ReductionFunc);
+ Builder.SetInsertPoint(ReductionFuncBlock);
+ Value *LHSArrayPtr = nullptr;
+ Value *RHSArrayPtr = nullptr;
+ LHSArrayPtr = ReductionFunc->getArg(0);
+ RHSArrayPtr = ReductionFunc->getArg(1);
+
+ unsigned NumReductions = ReductionInfos.size();
+ Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
+
+ for (auto En : enumerate(ReductionInfos)) {
+ const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+ Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedArrayTy, LHSArrayPtr, 0, En.index());
+ Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+ Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LHSI8Ptr, RI.Variable->getType());
+ Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+ Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedArrayTy, RHSArrayPtr, 0, En.index());
+ Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
+ Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RHSI8Ptr, RI.PrivateVariable->getType());
+ Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+ Value *Reduced;
+ Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
+ if (!Builder.GetInsertBlock())
+ return;
+ Builder.CreateStore(Reduced, LHSPtr);
+ }
+ Builder.CreateRetVoid();
+}
+
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
const LocationDescription &Loc, InsertPointTy AllocaIP,
ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef) {
@@ -2126,6 +4193,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
if (!updateToLocation(Loc))
return InsertPointTy();
+ if (ReductionInfos.empty())
+ return Builder.saveIP();
+
BasicBlock *InsertBlock = Loc.IP.getBlock();
BasicBlock *ContinuationBlock =
InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
@@ -2154,10 +4224,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
Module *Module = Func->getParent();
uint32_t SrcLocStrSize;
Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
- bool CanGenerateAtomic =
- llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
- return RI.AtomicReductionGen;
- });
+ bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
+ return RI.AtomicReductionGen;
+ });
Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
CanGenerateAtomic
? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
@@ -2167,7 +4236,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
const DataLayout &DL = Module->getDataLayout();
unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
- Function *ReductionFunc = getFreshReductionFunc(*Module);
+ Function *ReductionFunc = getFreshReductionFunc(M);
Value *Lock = getOMPCriticalRegionLock(".reduction");
Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
@@ -4615,7 +6684,8 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
Ident,
DynamicEnvironment,
});
- Twine KernelEnvironmentName = KernelName + "_kernel_environment";
+ std::string KernelEnvironmentName =
+ (KernelName + "_kernel_environment").str();
GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
KernelEnvironmentInitializer, KernelEnvironmentName,
@@ -6575,6 +8645,7 @@ void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
[&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
const TargetRegionEntryInfo &EntryInfo,
const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
+
// Generate metadata for target regions. Each entry of this metadata
// contains:
// - Entry 0 -> Kind of this type of metadata (0).
@@ -6912,7 +8983,6 @@ void OpenMPIRBuilder::registerTargetGlobalVariable(
VarSize = M.getDataLayout().getPointerSize();
Linkage = GlobalValue::WeakAnyLinkage;
}
-
OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
Flags, Linkage);
}
@@ -7000,6 +9070,7 @@ bool OffloadEntriesInfoManager::empty() const {
unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
const TargetRegionEntryInfo &EntryInfo) const {
+
auto It = OffloadEntriesTargetRegionCount.find(
getTargetRegionEntryCountKey(EntryInfo));
if (It == OffloadEntriesTargetRegionCount.end())
@@ -7009,6 +9080,7 @@ unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
const TargetRegionEntryInfo &EntryInfo) {
+
OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
EntryInfo.Count + 1;
}
@@ -7016,6 +9088,7 @@ void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
/// Initialize target region entry.
void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
+
OffloadEntriesTargetRegion[EntryInfo] =
OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
OMPTargetRegionEntryTargetRegion);
@@ -7025,6 +9098,7 @@ void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
OMPTargetRegionEntryKind Flags) {
+
assert(EntryInfo.Count == 0 && "expected default EntryInfo");
// Update the EntryInfo with the next available count for this location.
@@ -7072,6 +9146,7 @@ bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
const OffloadTargetRegionEntryInfoActTy &Action) {
+
// Scan all target region entries and perform the provided action.
for (const auto &It : OffloadEntriesTargetRegion) {
Action(It.first, It.second);
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 6988292ac7156..ec82b0a553a18 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1194,7 +1194,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
}
StructType *StructArgTy = nullptr;
- AllocaInst *Struct = nullptr;
+ Instruction *Struct = nullptr;
unsigned NumAggregatedInputs = 0;
if (AggregateArgs && !StructValues.empty()) {
std::vector<Type *> ArgTypes;
@@ -1210,12 +1211,16 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
if (ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) {
auto *StructSpaceCast = new AddrSpaceCastInst(
- Struct, PointerType ::get(Context, 0), "structArg.ascast");
+ Struct, PointerType ::get(Context, 0), "structArg.ascast");
StructSpaceCast->insertAfter(Struct);
- params.push_back(StructSpaceCast);
+ // Carry the address-space-cast pointer forward so the cast is actually
+ // consumed by the call-argument setup below.
+ Struct = StructSpaceCast;
} else {
- params.push_back(Struct);
}
+ params.push_back(Struct);
// Store aggregated inputs in the struct.
for (unsigned i = 0, e = StructValues.size(); i != e; ++i) {
if (inputs.contains(StructValues[i])) {
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index f4854ed3d1678..fb6d54afdc505 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -167,6 +167,9 @@ double omp_get_wtick(void);
double omp_get_wtime(void);
///}
+
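+/// Return the calling thread's lane index within its SIMD group.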
+int omp_get_simd_lane(void);
+
}
extern "C" {
@@ -233,6 +236,12 @@ void __kmpc_target_deinit();
///{
void *__kmpc_reduction_get_fixed_buffer();
+int32_t __kmpc_nvptx_simd_reduce_nowait_v2(IdentTy *Loc,
+ uint64_t reduce_data_size,
+ void *reduce_data,
+ ShuffleReductFnTy shflFct,
+ InterWarpCopyFnTy cpyFct);
+
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
uint64_t reduce_data_size,
void *reduce_data,
@@ -257,6 +266,8 @@ int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId);
void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId);
+void __kmpc_simd_barrier(void);
+
void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId);
void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId);
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index 165904644dbb9..948bfbbf0642a 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -105,6 +105,13 @@ uint32_t getMaxTeamThreads(bool IsSPMD);
/// Return the number of processing elements on the device.
uint32_t getNumberOfProcessorElements();
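+
+/// SIMD-group queries: each warp is subdivided into groups of getSimdLen()
+/// lanes; getSimdLane() is a thread's position within its group,
+/// getSimdGroup() the group index, and simdmask() the lane mask covering
+/// the group.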
+uint32_t getSimdLen();
+uint32_t getSimdGroup();
+uint32_t getSimdLane();
+bool isSimdLeader();
+uint32_t getNumSimdGroups();
+LaneMaskTy simdmask();
+
} // namespace mapping
} // namespace ompx
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 95d4c728016d2..80b3877ff9df1 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -40,9 +40,9 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
static void genericStateMachine(IdentTy *Ident) {
uint32_t TId = mapping::getThreadIdInBlock();
+
do {
ParallelRegionFnTy WorkFn = nullptr;
-
// Wait for the signal that we have a new work function.
synchronize::threads(atomic::seq_cst);
@@ -98,7 +98,9 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
}
if (mapping::isInitialThreadInLevel0(IsSPMD))
+ {
return -1;
+ }
// Enter the generic state machine if enabled and if this thread can possibly
// be an active worker thread.
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index b2028a8fb4f50..486a92d8f5217 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -322,6 +322,40 @@ uint32_t mapping::getNumberOfProcessorElements() {
return static_cast<uint32_t>(config::getHardwareParallelism());
}
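+
+// Number of lanes per SIMD group; currently a fixed placeholder of one
+// lane per group.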
+uint32_t mapping::getSimdLen() {
+ return 1;
+}
+
+uint32_t mapping::getSimdGroup() {
+ uint32_t SimdGroup = mapping::getThreadIdInBlock() / mapping::getSimdLen();
+ return SimdGroup;
+}
+
+uint32_t mapping::getSimdLane() {
+ uint32_t SimdId = mapping::getThreadIdInWarp() % mapping::getSimdLen();
+ return SimdId;
+}
+
+bool mapping::isSimdLeader() {
+ return !mapping::getSimdLane();
+}
+
+uint32_t mapping::getNumSimdGroups() {
+ uint32_t NumGroups = state::getEffectivePTeamSize() / mapping::getSimdLen();
+ return NumGroups;
+}
+
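+// Build a contiguous lane mask covering the calling thread's SIMD group.
+// E.g. with GroupSize == 4 and Group == 2 on a 32-lane warp, Mask starts
+// as 0xF and is shifted left by (2 * 4) % 32 == 8, yielding 0xF00.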
+LaneMaskTy mapping::simdmask() {
+ uint32_t GroupSize = mapping::getSimdLen();
+ uint32_t Group = mapping::getSimdGroup();
+ uint32_t WarpSize = mapping::getWarpSize();
+ LaneMaskTy Mask = ~(LaneMaskTy)0;
+ Mask = Mask >> (sizeof(LaneMaskTy) * 8 - GroupSize);
+ Mask = Mask << ((Group * GroupSize) % WarpSize);
+ return Mask;
+}
+
///}
/// Execution mode
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index 031a5ced25518..dc0490830884b 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -87,8 +87,10 @@ extern "C" {
int32_t num_threads,
void *fn, void **args,
const int64_t nargs) {
+ //printf("SPMD mode\n");
uint32_t TId = mapping::getThreadIdInBlock();
uint32_t NumThreads = determineNumberOfThreads(num_threads);
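+ // Each OpenMP thread spans mapping::getSimdLen() consecutive hardware
+ // lanes, so scale the logical thread count down by the SIMD width.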
+ NumThreads = NumThreads / mapping::getSimdLen();
uint32_t PTeamSize =
NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
// Avoid the race between the read of the `icv::Level` above and the write
@@ -101,6 +103,9 @@ extern "C" {
state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
1u, TId == 0, ident,
/*ForceTeamState=*/true);
state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
/*ForceTeamState=*/true);
state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
@@ -119,7 +124,7 @@ extern "C" {
// assumptions above.
synchronize::threadsAligned(atomic::relaxed);
- if (!PTeamSize || TId < PTeamSize)
+ if (!PTeamSize || (TId < PTeamSize * mapping::getSimdLen()))
invokeMicrotask(TId, 0, fn, args, nargs);
// Synchronize all threads at the end of a parallel region.
@@ -141,6 +146,8 @@ extern "C" {
return;
}
+
[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
int32_t num_threads, int proc_bind, void *fn,
@@ -166,9 +173,14 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
// From this point forward we know that there is no thread state used.
ASSERT(state::HasThreadState == false, nullptr);
+ //printf("num_threads=%i\n", num_threads);
uint32_t NumThreads = determineNumberOfThreads(num_threads);
+ //printf("NumThreads=%i\n", NumThreads);
+ NumThreads = NumThreads / mapping::getSimdLen();
+ //printf("New NumThreads=%i\n", NumThreads);
uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
+ //printf("PTeamSize=%i\n", PTeamSize);
if (mapping::isSPMDMode()) {
// This was moved to its own routine so it could be called directly
// in certain situations to avoid resource consumption of unused
@@ -184,7 +196,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
// set, but they do not have individual ThreadStates yet. If they ever
// modify the ICVs beyond this point a ThreadStates will be allocated.
- bool IsActiveParallelRegion = NumThreads > 1;
+ bool IsActiveParallelRegion = NumThreads * mapping::getSimdLen() > 1;
if (!IsActiveParallelRegion) {
state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
invokeMicrotask(TId, 0, fn, args, nargs);
@@ -253,12 +265,16 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
}
{
+ //printf("Generic execution\n");
// Note that the order here is important. `icv::Level` has to be updated
// last or the other updates will cause a thread specific state to be
// created.
state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
1u, true, ident,
/*ForceTeamState=*/true);
state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
(void *)nullptr, true, ident,
/*ForceTeamState=*/true);
@@ -287,7 +303,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
// Set to true for workers participating in the parallel region.
uint32_t TId = mapping::getThreadIdInBlock();
- bool ThreadIsActive = TId < state::getEffectivePTeamSize();
+ bool ThreadIsActive =
+     TId < state::getEffectivePTeamSize() * mapping::getSimdLen();
return ThreadIsActive;
}
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 744d1a3a231c8..2bfec6dcd6fae 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -164,9 +164,57 @@ uint32_t roundToWarpsize(uint32_t s) {
uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
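+
+// Reduce the private values of one SIMD group across its lanes; returns
+// nonzero on the group leader (lane 0 of the group), which then holds the
+// combined value.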
+static int32_t nvptx_simd_reduce_nowait(void *reduce_data,
+ ShuffleReductFnTy shflFct,
+ InterWarpCopyFnTy cpyFct) {
+ uint32_t SimdId = mapping::getSimdLane();
+ uint32_t NumThreads = mapping::getSimdLen();
+ if (NumThreads == 1)
+ return 1;
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ if (NumThreads == mapping::getWarpSize())
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+ else
+ gpu_irregular_warp_reduce(reduce_data, shflFct,
+ /*LaneCount=*/NumThreads,
+ /*LaneId=*/mapping::getSimdLane());
+#else
+ __kmpc_impl_lanemask_t Liveness = mapping::simdmask();
+ if (Liveness == lanes::All) // Full warp
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+ else
+ gpu_irregular_warp_reduce(reduce_data, shflFct,
+ /*LaneCount=*/utils::popc(Liveness),
+ /*LaneId=*/mapping::getSimdLane());
+#endif
+
+ return mapping::isSimdLeader();
+}
+
} // namespace
extern "C" {
+int32_t __kmpc_nvptx_simd_reduce_nowait_v2(IdentTy *Loc,
+ uint64_t reduce_data_size,
+ void *reduce_data,
+ ShuffleReductFnTy shflFct,
+ InterWarpCopyFnTy cpyFct) {
+ return nvptx_simd_reduce_nowait(reduce_data, shflFct, cpyFct);
+}
+
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
uint64_t reduce_data_size,
void *reduce_data,
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index a1e4fa2449d9a..ba8e9a982dae7 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -364,13 +364,18 @@ void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
}
int omp_get_ancestor_thread_num(int Level) {
- return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
+ return returnValIfLevelIsActive(Level, mapping::getSimdGroup(), 0);
}
int omp_get_thread_num(void) {
return omp_get_ancestor_thread_num(omp_get_level());
}
+int omp_get_simd_lane(void) {
+ return mapping::getSimdLane();
+}
+
int omp_get_team_size(int Level) {
return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index 80ba87b300bcd..1d601657d391b 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -529,6 +529,10 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
impl::namedBarrier();
}
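+// Synchronize only the lanes belonging to the calling thread's SIMD group.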
+void __kmpc_simd_barrier(void) {
+ synchronize::warp(mapping::simdmask());
+}
+
[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
}
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index bcb7c5ad50a18..658c26c24a3a8 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -468,6 +468,28 @@ static void popDST() {
ThreadDSTPtr = OldDST;
}
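+
+// SimdLoop distributes TripCount iterations round-robin across the lanes of
+// one SIMD group: e.g. with getSimdLen() == 4 and TripCount == 10, lane 0
+// runs iterations {0, 4, 8}, lane 1 {1, 5, 9}, lane 2 {2, 6}, and lane 3
+// {3, 7}; the group re-converges at the trailing warp synchronization.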
+template <typename IType>
+void SimdLoop(IdentTy *ident, void *WorkFn, IType TripCount, void **Args) {
+ ASSERT(WorkFn, "expected valid outlined function");
+ __kmpc_impl_lanemask_t SimdMask = mapping::simdmask();
+ uint32_t Step = mapping::getSimdLen();
+
+ synchronize::warp(SimdMask);
+ for (IType omp_iv = (IType)mapping::getSimdLane(); omp_iv < TripCount;
+      omp_iv += Step)
+   ((void (*)(IType, void **))WorkFn)(omp_iv, Args);
+ synchronize::warp(SimdMask);
+}
+
extern "C" {
// init
@@ -634,6 +656,28 @@ void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
+
+void __kmpc_simd_4u(IdentTy *ident, void *WorkFn, uint32_t TripCount,
+                    void **Args) {
+ SimdLoop<uint32_t>(ident, WorkFn, TripCount, Args);
+}
+
+void __kmpc_simd_8u(IdentTy *ident, void *WorkFn, uint64_t TripCount,
+                    void **Args) {
+ SimdLoop<uint64_t>(ident, WorkFn, TripCount, Args);
+}
+
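+// Non-loop entry point: invoke the outlined SIMD region once on the
+// calling thread with the packed argument list (nargs is currently unused).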
+void __kmpc_simd(IdentTy *ident, void *WorkFn, void **Args, uint32_t nargs) {
+ ASSERT(WorkFn, "expected valid outlined function");
+ ((void (*)(void **))WorkFn)(Args);
+}
+
}
namespace ompx {
>From 1fbe13c40c28421a6a055c31217963acc2c5ea9b Mon Sep 17 00:00:00 2001
From: Eric Francis Wright <wright117 at rzvernal10.llnl.gov>
Date: Fri, 14 Jun 2024 08:44:22 -0700
Subject: [PATCH 2/2] Changed where certain variables are allocated so that it
is consistent
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 77f3f863e1e35..8e977d14c0e47 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1587,7 +1587,8 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
InsertPointTy DistanceIP(PrologBB, PrologBB->getTerminator()->getIterator());
assert(DistanceCB && "expected loop trip count callback function!");
- Value *DistVal = DistanceCB(EntryBB, DistanceIP);
+ Value *DistVal = DistanceCB(OuterAllocaBlock, DistanceIP);
assert(DistVal && "trip count call back should return integer trip count");
Type *DistValType = DistVal->getType();
assert(DistValType->isIntegerTy() && "trip count should be integer type");