[llvm] 49ccf46 - [OpenMP] [IR Builder] Changes to Support Scan Operation (#136035)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 14:58:14 PDT 2025
Author: Anchu Rajendran S
Date: 2025-08-07T14:58:11-07:00
New Revision: 49ccf46adc455b64c2be0006092651182b1cb2c4
URL: https://github.com/llvm/llvm-project/commit/49ccf46adc455b64c2be0006092651182b1cb2c4
DIFF: https://github.com/llvm/llvm-project/commit/49ccf46adc455b64c2be0006092651182b1cb2c4.diff
LOG: [OpenMP] [IR Builder] Changes to Support Scan Operation (#136035)
OpenMP supports scan reductions through the `scan` directive. The
reduction clause of a for-loop/simd directive can take an `inscan`
modifier, in which case the body of the directive must contain a `scan`
directive. This PR implements the lowering logic for scan reductions in
OpenMP workshare loops.
The body of the for loop is split into two loops (Input phase loop and
Scan Phase loop) and a scan reduction loop is added in the middle. The
Input phase loop populates a temporary buffer with initial values that
are to be reduced. The buffer is used by the reduction loop to perform
scan reduction. Scan phase loop copies the values of the buffer to the
reduction variable before executing the scan phase. Below is a high
level view of the code generated.
```
<declare pointer to buffer> ptr
omp parallel {
size num_iters = <num_iters>
// temp buffer allocation
omp masked {
buff = malloc(num_iters*scanvarstype)
*ptr = buff
}
barrier;
// input phase loop
for (i: 0..<num_iters>) {
<input phase>;
buffer = *ptr;
buffer[i] = red;
}
// scan reduction
omp masked
{
for (int k = 0; k != ceil(log2(num_iters)); ++k) {
i=pow(2,k)
for (size cnt = last_iter; cnt >= i; --cnt) {
buffer = *ptr;
buffer[cnt] op= buffer[cnt-i];
}
}
}
barrier;
// scan phase loop
for (i: 0..<num_iters>) {
buffer = *ptr;
red = buffer[i] ;
<scan phase>;
}
// temp buffer deletion
omp masked {
free(*ptr)
}
barrier;
}
```
The temporary buffer needs to be shared between all threads performing
the reduction, since it is read and written in both the input and scan
workshare loops. This is achieved by declaring a pointer to the buffer in
the shared region and having the master thread allocate the buffer
dynamically. This is why allocation, deallocation and the scan reduction
itself are performed within `masked`. The code has been verified to produce
correct results for Fortran programs with the code changes in the PR
https://github.com/llvm/llvm-project/pull/133149
Added:
Modified:
llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index b681ea8413726..f70659120e1e6 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -31,6 +31,7 @@
namespace llvm {
class CanonicalLoopInfo;
+class ScanInfo;
struct TargetRegionEntryInfo;
class OffloadEntriesInfoManager;
class OpenMPIRBuilder;
@@ -707,6 +708,9 @@ class OpenMPIRBuilder {
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(
const LocationDescription &Loc, omp::Directive CanceledDirective);
+ /// Creates a new ScanInfo object owned by this builder and returns a pointer
+ /// to it.
+ LLVM_ABI Expected<ScanInfo *> scanInfoInitialize();
+
/// Generator for '#omp parallel'
///
/// \param Loc The insert and source location description.
@@ -750,6 +754,42 @@ class OpenMPIRBuilder {
LoopBodyGenCallbackTy BodyGenCB, Value *TripCount,
const Twine &Name = "loop");
+ /// Generator for the control flow structure of OpenMP canonical loops when
+ /// the parent directive has an `inscan` modifier specified.
+ /// If the `inscan` modifier is specified, the region of the parent is
+ /// expected to have a `scan` directive. Based on the clauses in the
+ /// scan directive, the body of the loop is split into two loops: the Input
+ /// loop and the Scan loop. The Input loop contains the code generated for
+ /// the input phase of scan and the Scan loop contains the code generated for
+ /// the scan phase of scan. `createScan` is expected to be called from the
+ /// bodyGen callback of these loops when a scan directive is encountered in
+ /// the loop body. Based on 1. whether an inclusive or exclusive scan is
+ /// specified and 2. whether the input loop or the scan loop is being
+ /// generated, `createScan` lowers the body of the for loop accordingly.
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param BodyGenCB Callback that will generate the loop body code.
+ /// \param Start Value of the loop counter for the first iterations.
+ /// \param Stop Loop counter values past this will stop the loop.
+ /// \param Step Loop counter increment after each iteration; negative
+ ///             means counting down.
+ /// \param IsSigned Whether Start, Stop and Step are signed integers.
+ /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
+ ///                      counter.
+ /// \param ComputeIP Insertion point for instructions computing the trip
+ ///                  count. Can be used to ensure the trip count is available
+ ///                  at the outermost loop of a loop nest. If not set,
+ ///                  defaults to the preheader of the generated loop.
+ /// \param Name Base name used to derive BB and instruction names.
+ /// \param ScanRedInfo Pointer to the ScanInfo object created using
+ ///                    `scanInfoInitialize`.
+ ///
+ /// \returns A vector containing the Loop Info of the Input Loop followed by
+ ///          the Loop Info of the Scan Loop.
+ LLVM_ABI Expected<SmallVector<llvm::CanonicalLoopInfo *>>
+ createCanonicalScanLoops(
+     const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
+     Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
+     InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo);
+
/// Calculate the trip count of a canonical loop.
///
/// This allows specifying user-defined loop counter values using increment,
@@ -818,13 +858,17 @@ class OpenMPIRBuilder {
/// at the outermost loop of a loop nest. If not set,
/// defaults to the preheader of the generated loop.
/// \param Name Base name used to derive BB and instruction names.
+ /// \param InScan Whether loop has a scan reduction specified.
+ /// \param ScanRedInfo Pointer to the ScanInfo object created using
+ /// `scanInfoInitialize`; dereferenced when \p InScan is true.
///
/// \returns An object representing the created control flow structure which
/// can be used for loop-associated directives.
LLVM_ABI Expected<CanonicalLoopInfo *> createCanonicalLoop(
const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
- InsertPointTy ComputeIP = {}, const Twine &Name = "loop");
+ InsertPointTy ComputeIP = {}, const Twine &Name = "loop",
+ bool InScan = false, ScanInfo *ScanRedInfo = nullptr);
/// Collapse a loop nest into a single loop.
///
@@ -1556,6 +1600,47 @@ class OpenMPIRBuilder {
ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
Function *ReduceFn, AttributeList FuncAttrs);
+ /// Helper for `createCanonicalScanLoops`: invokes \p InputLoopGen to emit the
+ /// input-phase loop, then \p ScanLoopGen to emit the scan-phase loop.
+ /// \param InputLoopGen Callback for generating the loop for input phase.
+ /// \param ScanLoopGen Callback for generating the loop for scan phase.
+ /// \param ScanRedInfo Pointer to the ScanInfo object created using
+ /// `scanInfoInitialize`.
+ ///
+ /// \return error if any produced, else return success.
+ Error emitScanBasedDirectiveIR(
+ llvm::function_ref<Error()> InputLoopGen,
+ llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
+ ScanInfo *ScanRedInfo);
+
+ /// Creates the scan basic blocks (dispatch, before/after scan, loop exit).
+ /// \param ScanRedInfo Pointer to the ScanInfo object created using
+ /// `scanInfoInitialize`; receives the created blocks.
+ void createScanBBs(ScanInfo *ScanRedInfo);
+
+ /// Dynamically allocates the buffer needed for scan reduction.
+ /// \param AllocaIP The IP where the shared pointer to the buffer is declared.
+ /// \param ScanVars Scan Variables.
+ /// \param ScanVarsType Types of the scan variables, indexed like ScanVars.
+ /// \param ScanRedInfo Pointer to the ScanInfo object created using
+ /// `scanInfoInitialize`.
+ ///
+ /// \return error if any produced, else return success.
+ Error emitScanBasedDirectiveDeclsIR(InsertPointTy AllocaIP,
+ ArrayRef<llvm::Value *> ScanVars,
+ ArrayRef<llvm::Type *> ScanVarsType,
+ ScanInfo *ScanRedInfo);
+
+ /// Copies the result back to the reduction variable and frees the temporary
+ /// scan buffer.
+ /// \param ReductionInfos Array type containing the ReductionOps.
+ /// \param ScanRedInfo Pointer to the ScanInfo object created using
+ ///                    `scanInfoInitialize`.
+ ///
+ /// \return error if any produced, else return success.
+ Error emitScanBasedDirectiveFinalsIR(
+     ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+     ScanInfo *ScanRedInfo);
+
/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
@@ -2184,6 +2269,9 @@ class OpenMPIRBuilder {
/// free'd.
std::forward_list<CanonicalLoopInfo> LoopInfos;
+ /// Owned ScanInfo objects (see scanInfoInitialize) that need to be free'd.
+ std::forward_list<ScanInfo> ScanInfos;
+
/// Add a new region that will be outlined later.
void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
@@ -2639,6 +2727,48 @@ class OpenMPIRBuilder {
FinalizeCallbackTy FiniCB,
Value *Filter);
+ /// This function performs the scan reduction of the values updated in
+ /// the input phase. The reduction logic needs to be emitted between the
+ /// input and scan loops returned by `createCanonicalScanLoops`. The following
+ /// is the code that is generated; `buffer` and `span` are expected to be
+ /// populated before executing the generated code.
+ /// \code{c}
+ /// for (int k = 0; k != ceil(log2(span)); ++k) {
+ ///   i=pow(2,k)
+ ///   for (size cnt = last_iter; cnt >= i; --cnt)
+ ///     buffer[cnt] op= buffer[cnt-i];
+ /// }
+ /// \endcode
+ /// The loop nest is emitted inside a masked region that is followed by a
+ /// barrier. The finalization code that copies the results back to the
+ /// reduction variables is emitted as well.
+ /// \param Loc The insert and source location description.
+ /// \param ReductionInfos Array type containing the ReductionOps.
+ /// \param ScanRedInfo Pointer to the ScanInfo object created using
+ ///                    `scanInfoInitialize`.
+ ///
+ /// \returns The insertion position *after* the barrier that follows the
+ ///          masked region.
+ LLVM_ABI InsertPointOrErrorTy emitScanReduction(
+     const LocationDescription &Loc,
+     ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+     ScanInfo *ScanRedInfo);
+
+ /// Lowers a `scan` directive: splits the loop body at the directive and
+ /// directs the control flow to the input phase blocks or the scan phase
+ /// blocks based on 1. whether the input loop or the scan loop is being
+ /// generated, 2. whether an exclusive or inclusive scan is used.
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param AllocaIP The IP where the temporary buffer for scan reduction
+ ///                 needs to be allocated.
+ /// \param ScanVars Scan Variables.
+ /// \param ScanVarsType Types of the scan variables, indexed like
+ ///                     \p ScanVars.
+ /// \param IsInclusive Whether it is an inclusive or exclusive scan. Only
+ ///                    inclusive scans are handled by the generated code for
+ ///                    now.
+ /// \param ScanRedInfo Pointer to the ScanInfo object created using
+ ///                    `scanInfoInitialize`.
+ ///
+ /// \returns The insertion position *after* the scan.
+ LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc,
+                                          InsertPointTy AllocaIP,
+                                          ArrayRef<llvm::Value *> ScanVars,
+                                          ArrayRef<llvm::Type *> ScanVarsType,
+                                          bool IsInclusive,
+                                          ScanInfo *ScanRedInfo);
+
/// Generator for '#omp critical'
///
/// \param Loc The insert and source location description.
@@ -3779,6 +3909,93 @@ class CanonicalLoopInfo {
LLVM_ABI void invalidate();
};
+/// ScanInfo holds the information to assist in lowering of Scan reduction.
+/// Before lowering, the body of the for loop specifying scan reduction is
+/// expected to have the following structure
+///
+///     Loop Body Entry
+///           |
+///     Code before the scan directive
+///           |
+///     Scan Directive
+///           |
+///     Code after the scan directive
+///           |
+///     Loop Body Exit
+///
+/// When `createCanonicalScanLoops` is executed, the bodyGen callback of it
+/// transforms the body to:
+///
+///     Loop Body Entry
+///           |
+///     OMPScanDispatch
+///
+///     OMPBeforeScanBlock
+///           |
+///     OMPScanLoopExit
+///           |
+///     Loop Body Exit
+///
+/// The insert point is updated to the first insert point of
+/// OMPBeforeScanBlock. It dominates the control flow of code generated until
+/// the scan directive is encountered and OMPAfterScanBlock dominates the
+/// control flow of code generated after scan is encountered. The successor
+/// of OMPScanDispatch can be OMPBeforeScanBlock or OMPAfterScanBlock based
+/// on 1. whether it is in Input phase or Scan Phase, 2. whether it is an
+/// exclusive or inclusive scan. This jump is added when `createScan` is
+/// executed. While the input loop is being generated, `OMPBeforeScanBlock`
+/// succeeds `OMPScanDispatch` for an inclusive scan and `OMPAfterScanBlock`
+/// succeeds it for an exclusive scan; the choice is reversed for the scan
+/// loop. In the input loop the temporary buffer is populated and in the scan
+/// loop the temporary buffer is read. After the scan directive
+/// is encountered, the insertion point is updated to `OMPAfterScanBlock` as
+/// it is expected to dominate the code after the scan directive. Both Before
+/// and After scan blocks are succeeded by `OMPScanLoopExit`.
+/// Temporary buffer allocations are done in the `OMPScanInit` block before
+/// the lowering of the for-loop. The results are copied back to the reduction
+/// variable in the `OMPScanFinish` block.
+class ScanInfo {
+public:
+  /// Dominates the body of the loop before scan directive
+  llvm::BasicBlock *OMPBeforeScanBlock = nullptr;
+
+  /// Dominates the body of the loop after scan directive
+  llvm::BasicBlock *OMPAfterScanBlock = nullptr;
+
+  /// Controls the flow to before or after scan blocks
+  llvm::BasicBlock *OMPScanDispatch = nullptr;
+
+  /// Exit block of loop body
+  llvm::BasicBlock *OMPScanLoopExit = nullptr;
+
+  /// Block before loop body where scan initializations are done
+  llvm::BasicBlock *OMPScanInit = nullptr;
+
+  /// Block after loop body where scan finalizations are done
+  llvm::BasicBlock *OMPScanFinish = nullptr;
+
+  /// If true, it indicates Input phase is lowered; else it indicates
+  /// ScanPhase is lowered
+  bool OMPFirstScanLoop = false;
+
+  /// Maps the private reduction variable to the pointer of the temporary
+  /// buffer. The map is owned by this object: it is allocated in the
+  /// constructor and released in the destructor.
+  llvm::SmallDenseMap<llvm::Value *, llvm::Value *> *ScanBuffPtrs = nullptr;
+
+  /// Keeps track of value of iteration variable for input/scan loop to be
+  /// used for Scan directive lowering
+  llvm::Value *IV = nullptr;
+
+  /// Stores the span of canonical loop being lowered to be used for temporary
+  /// buffer allocation or Finalization.
+  llvm::Value *Span = nullptr;
+
+  ScanInfo() {
+    ScanBuffPtrs = new llvm::SmallDenseMap<llvm::Value *, llvm::Value *>();
+  }
+
+  // The object owns ScanBuffPtrs through a raw pointer, so a member-wise copy
+  // would make two objects delete the same map. Copying is disallowed.
+  ScanInfo(const ScanInfo &) = delete;
+  ScanInfo &operator=(const ScanInfo &) = delete;
+
+  ~ScanInfo() { delete (ScanBuffPtrs); }
+};
+
} // end namespace llvm
#endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 260d3c292e56b..ea027e48fa2f1 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4014,6 +4014,340 @@ OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
/*Conditional*/ true, /*hasFinalize*/ true);
}
+/// Emits a call to \p Callee with \p Args at the current insertion point of
+/// \p Builder, marks the call as non-throwing and returns it.
+static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
+                                               llvm::FunctionCallee Callee,
+                                               ArrayRef<llvm::Value *> Args,
+                                               const llvm::Twine &Name) {
+  // No operand bundles are attached to the call.
+  SmallVector<llvm::OperandBundleDef, 1> NoBundles;
+  llvm::CallInst *NoUnwindCall =
+      Builder.CreateCall(Callee, Args, NoBundles, Name);
+  NoUnwindCall->setDoesNotThrow();
+  return NoUnwindCall;
+}
+
+// Lowers a scan directive found in a loop body. The incoming insertion block
+// is expected to be dominated by OMPBeforeScanBlock; everything emitted after
+// this call is placed in OMPAfterScanBlock. The scan directive separates the
+// input phase from the scan phase. Depending on whether the scan is inclusive
+// or exclusive, and on whether the input loop (the first scan loop) or the
+// scan loop (the second one) is being lowered, the dispatch block jumps to
+// the before-scan or the after-scan block. The code generated handles only
+// inclusive scans now.
+OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
+    const LocationDescription &Loc, InsertPointTy AllocaIP,
+    ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
+    bool IsInclusive, ScanInfo *ScanRedInfo) {
+  const bool IsInputLoop = ScanRedInfo->OMPFirstScanLoop;
+
+  // The shared buffer pointers are declared and the buffers allocated while
+  // the input loop is lowered.
+  if (IsInputLoop)
+    if (llvm::Error DeclErr = emitScanBasedDirectiveDeclsIR(
+            AllocaIP, ScanVars, ScanVarsType, ScanRedInfo))
+      return DeclErr;
+
+  if (!updateToLocation(Loc))
+    return Loc.IP;
+
+  const size_t NumScanVars = ScanVars.size();
+
+  // Emits `&buffer[IV]` for the scan variable at position Idx: loads the
+  // buffer address from its shared pointer and indexes it with the current
+  // iteration variable.
+  auto EmitBufferSlot = [&](size_t Idx) -> Value * {
+    Value *SharedPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[Idx]];
+    Value *Buffer = Builder.CreateLoad(Builder.getPtrTy(), SharedPtr);
+    return Builder.CreateInBoundsGEP(ScanVarsType[Idx], Buffer,
+                                     ScanRedInfo->IV, "arrayOffset");
+  };
+
+  if (IsInputLoop) {
+    // Emit buffer[i] = red; at the end of the input phase.
+    for (size_t Idx = 0; Idx != NumScanVars; ++Idx) {
+      Value *Slot = EmitBufferSlot(Idx);
+      Value *RedVal = Builder.CreateLoad(ScanVarsType[Idx], ScanVars[Idx]);
+      Builder.CreateStore(RedVal, Slot);
+    }
+  }
+
+  // Close the code emitted so far and continue in the dispatch block.
+  Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
+  emitBlock(ScanRedInfo->OMPScanDispatch,
+            Builder.GetInsertBlock()->getParent());
+
+  if (!IsInputLoop) {
+    // Emit red = buffer[i]; at the entrance to the scan phase.
+    // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
+    for (size_t Idx = 0; Idx != NumScanVars; ++Idx) {
+      Value *Slot = EmitBufferSlot(Idx);
+      Value *BufVal = Builder.CreateLoad(ScanVarsType[Idx], Slot);
+      Builder.CreateStore(BufVal, ScanVars[Idx]);
+    }
+  }
+
+  // The condition is the constant `true`, so only the first successor is
+  // ever taken; the second one only keeps the other block reachable.
+  // TODO: Update it to CreateBr and remove dead blocks
+  const bool BeforeBlockIsTaken = (IsInputLoop == IsInclusive);
+  BasicBlock *TakenBB = BeforeBlockIsTaken ? ScanRedInfo->OMPBeforeScanBlock
+                                           : ScanRedInfo->OMPAfterScanBlock;
+  BasicBlock *OtherBB = BeforeBlockIsTaken ? ScanRedInfo->OMPAfterScanBlock
+                                           : ScanRedInfo->OMPBeforeScanBlock;
+  Builder.CreateCondBr(Builder.getInt1(true), TakenBB, OtherBB);
+
+  emitBlock(ScanRedInfo->OMPAfterScanBlock,
+            Builder.GetInsertBlock()->getParent());
+  Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
+  return Builder.saveIP();
+}
+
+/// Declares one pointer slot per scan variable at \p AllocaIP and records it
+/// in ScanRedInfo->ScanBuffPtrs. Then, in the OMPScanInit block, emits a
+/// masked region (filter 0) that mallocs a temporary buffer of `Span + 1`
+/// elements for every scan variable and stores its address in the matching
+/// slot, followed by a barrier.
+///
+/// \return error if any produced, else return success.
+Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
+    InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
+    ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
+
+  Builder.restoreIP(AllocaIP);
+  // Create the shared pointer at alloca IP.
+  for (size_t i = 0; i < ScanVars.size(); i++) {
+    llvm::Value *BuffPtr =
+        Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
+    (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
+  }
+
+  // Allocate temporary buffer by master thread
+  auto BodyGenCB = [&](InsertPointTy /*AllocaIP*/,
+                       InsertPointTy CodeGenIP) -> Error {
+    Builder.restoreIP(CodeGenIP);
+    // Build the constant 1 with the span's own integer type so that the add
+    // is well-typed for any trip-count width, not only i32.
+    Value *AllocSpan = Builder.CreateAdd(
+        ScanRedInfo->Span,
+        llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
+    for (size_t i = 0; i < ScanVars.size(); i++) {
+      Type *IntPtrTy = Builder.getInt32Ty();
+      Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
+      Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
+      Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
+                                         AllocSpan, nullptr, "arr");
+      Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
+    }
+    return Error::success();
+  };
+  // TODO: Perform finalization actions for variables. This has to be
+  // called for variables which have destructors/finalizers.
+  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
+
+  // The allocation is emitted in the scan init block, ahead of its
+  // terminator.
+  Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
+  llvm::Value *FilterVal = Builder.getInt32(0);
+  llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
+      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
+
+  if (!AfterIP)
+    return AfterIP.takeError();
+  Builder.restoreIP(*AfterIP);
+  // Emit the barrier in front of the block terminator when there is one.
+  BasicBlock *InputBB = Builder.GetInsertBlock();
+  if (Instruction *Term = InputBB->getTerminator())
+    Builder.SetInsertPoint(Term);
+  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
+  if (!AfterIP)
+    return AfterIP.takeError();
+  Builder.restoreIP(*AfterIP);
+
+  return Error::success();
+}
+
+/// Emits, in the OMPScanFinish block, a masked region (filter 0) that for
+/// every reduction loads the buffer element at index `Span`, stores it to the
+/// original reduction variable and frees the temporary buffer. The masked
+/// region is followed by a barrier.
+///
+/// \return error if any produced, else return success.
+Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
+    ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
+  auto BodyGenCB = [&](InsertPointTy /*AllocaIP*/,
+                       InsertPointTy CodeGenIP) -> Error {
+    Builder.restoreIP(CodeGenIP);
+    // Iterate by reference: the entries are only read here.
+    for (const ReductionInfo &RedInfo : ReductionInfos) {
+      Value *PrivateVar = RedInfo.PrivateVariable;
+      Value *OrigVar = RedInfo.Variable;
+      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
+      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
+
+      Type *SrcTy = RedInfo.ElementType;
+      Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
+                                             "arrayOffset");
+      Value *Src = Builder.CreateLoad(SrcTy, Val);
+
+      Builder.CreateStore(Src, OrigVar);
+      Builder.CreateFree(Buff);
+    }
+    return Error::success();
+  };
+  // TODO: Perform finalization actions for variables. This has to be
+  // called for variables which have destructors/finalizers.
+  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
+
+  // Insert ahead of the finish block's terminator if it already has one,
+  // otherwise append to the block.
+  if (ScanRedInfo->OMPScanFinish->getTerminator())
+    Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
+  else
+    Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
+
+  llvm::Value *FilterVal = Builder.getInt32(0);
+  llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
+      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
+
+  if (!AfterIP)
+    return AfterIP.takeError();
+  Builder.restoreIP(*AfterIP);
+  // Emit the barrier in front of the block terminator when there is one.
+  BasicBlock *InputBB = Builder.GetInsertBlock();
+  if (Instruction *Term = InputBB->getTerminator())
+    Builder.SetInsertPoint(Term);
+  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
+  if (!AfterIP)
+    return AfterIP.takeError();
+  Builder.restoreIP(*AfterIP);
+  return Error::success();
+}
+
+/// Emits the scan reduction over the temporary buffers. Inside a masked
+/// region (filter 0) a two-level loop nest combines buffer elements with the
+/// reduction generator of each entry of \p ReductionInfos; the outer loop
+/// runs ceil(log2(Span)) rounds and doubles the element distance each round.
+/// A barrier is emitted after the masked region, then the finalization code
+/// (emitScanBasedDirectiveFinalsIR).
+///
+/// \returns The insertion position after the barrier, or an error.
+OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
+    const LocationDescription &Loc,
+    ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+    ScanInfo *ScanRedInfo) {
+
+  if (!updateToLocation(Loc))
+    return Loc.IP;
+  auto BodyGenCB = [&](InsertPointTy /*AllocaIP*/,
+                       InsertPointTy CodeGenIP) -> Error {
+    Builder.restoreIP(CodeGenIP);
+    Function *CurFn = Builder.GetInsertBlock()->getParent();
+    // k = 0; do { ... } while (++k < ceil(log2(n)));
+    llvm::BasicBlock *LoopBB =
+        BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
+    llvm::BasicBlock *ExitBB =
+        splitBB(Builder, false, "omp.outer.log.scan.exit");
+    llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
+        Builder.GetInsertBlock()->getModule(),
+        (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
+    llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
+    llvm::Value *Arg =
+        Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
+    llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
+    F = llvm::Intrinsic::getOrInsertDeclaration(
+        Builder.GetInsertBlock()->getModule(),
+        (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
+    LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
+    LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
+    llvm::Value *NMin1 = Builder.CreateNUWSub(
+        ScanRedInfo->Span,
+        llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
+    Builder.SetInsertPoint(InputBB);
+    Builder.CreateBr(LoopBB);
+    emitBlock(LoopBB, CurFn);
+    Builder.SetInsertPoint(LoopBB);
+
+    PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
+    // size pow2k = 1;
+    PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
+    Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
+                         InputBB);
+    Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
+                       InputBB);
+    // for (size i = n - 1; i >= 2 ^ k; --i)
+    //   tmp[i] op= tmp[i-pow2k];
+    llvm::BasicBlock *InnerLoopBB =
+        BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
+    llvm::BasicBlock *InnerExitBB =
+        BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
+    llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
+    Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+    emitBlock(InnerLoopBB, CurFn);
+    Builder.SetInsertPoint(InnerLoopBB);
+    PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
+    IVal->addIncoming(NMin1, LoopBB);
+    // Iterate by reference: the entries are only read here.
+    for (const ReductionInfo &RedInfo : ReductionInfos) {
+      Value *ReductionVal = RedInfo.PrivateVariable;
+      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
+      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
+      Type *DestTy = RedInfo.ElementType;
+      // The buffer element combined in this step sits one past IVal.
+      Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
+      Value *LHSPtr =
+          Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
+      Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
+      Value *RHSPtr =
+          Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
+      Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
+      Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
+      llvm::Value *Result = nullptr;
+      InsertPointOrErrorTy AfterIP =
+          RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
+      if (!AfterIP)
+        return AfterIP.takeError();
+      Builder.CreateStore(Result, LHSPtr);
+    }
+    llvm::Value *NextIVal = Builder.CreateNUWSub(
+        IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
+    IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
+    CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
+    Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+    emitBlock(InnerExitBB, CurFn);
+    llvm::Value *Next = Builder.CreateNUWAdd(
+        Counter, llvm::ConstantInt::get(Counter->getType(), 1));
+    Counter->addIncoming(Next, Builder.GetInsertBlock());
+    // pow2k <<= 1;
+    llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
+    Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
+    // The outer loop is bottom-tested, so Next is already 1 on the first
+    // check. Comparing with `<` instead of `!=` lets the loop exit when the
+    // round count is 0 (span of 1); `!=` would never see Next == 0. For a
+    // round count of 1 or more both predicates exit on the same iteration.
+    llvm::Value *Cmp = Builder.CreateICmpULT(Next, LogVal);
+    Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
+    Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
+    return Error::success();
+  };
+
+  // TODO: Perform finalization actions for variables. This has to be
+  // called for variables which have destructors/finalizers.
+  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
+
+  llvm::Value *FilterVal = Builder.getInt32(0);
+  llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
+      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
+
+  if (!AfterIP)
+    return AfterIP.takeError();
+  Builder.restoreIP(*AfterIP);
+  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
+
+  if (!AfterIP)
+    return AfterIP.takeError();
+  Builder.restoreIP(*AfterIP);
+  Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
+  if (Err)
+    return Err;
+
+  return AfterIP;
+}
+
+/// Drives the generation of the two loops of a scan reduction: first the
+/// input loop via \p InputLoopGen, then the scan loop via \p ScanLoopGen.
+/// ScanRedInfo->OMPFirstScanLoop is set before each callback so that the scan
+/// lowering knows which of the two loops is being generated.
+Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
+    llvm::function_ref<Error()> InputLoopGen,
+    llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
+    ScanInfo *ScanRedInfo) {
+
+  // Input phase loop:
+  //   for (i: 0..<num_iters>) {
+  //     <input phase>;
+  //     buffer[i] = red;
+  //   }
+  ScanRedInfo->OMPFirstScanLoop = true;
+  if (Error InputErr = InputLoopGen())
+    return InputErr;
+
+  // Scan phase loop, generated at the builder's current insertion point:
+  //   for (i: 0..<num_iters>) {
+  //     red = buffer[i];
+  //     <scan phase>;
+  //   }
+  ScanRedInfo->OMPFirstScanLoop = false;
+  if (Error ScanErr = ScanLoopGen(Builder.saveIP()))
+    return ScanErr;
+
+  return Error::success();
+}
+
+/// Creates the four basic blocks used to lower a scan directive and stores
+/// them in \p ScanRedInfo. The blocks are created without a parent function.
+void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
+  LLVMContext &Ctx = Builder.GetInsertBlock()->getParent()->getContext();
+  auto NewBlock = [&Ctx](const char *BlockName) {
+    return BasicBlock::Create(Ctx, BlockName);
+  };
+
+  ScanRedInfo->OMPScanDispatch = NewBlock("omp.inscan.dispatch");
+  ScanRedInfo->OMPAfterScanBlock = NewBlock("omp.after.scan.bb");
+  ScanRedInfo->OMPBeforeScanBlock = NewBlock("omp.before.scan.bb");
+  ScanRedInfo->OMPScanLoopExit = NewBlock("omp.scan.loop.exit");
+}
CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
BasicBlock *PostInsertBefore, const Twine &Name) {
@@ -4111,6 +4445,76 @@ OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
return CL;
}
+/// Default-constructs a ScanInfo at the front of ScanInfos, which keeps
+/// ownership of it, and returns a pointer to the new object.
+Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
+  ScanInfos.emplace_front();
+  return &ScanInfos.front();
+}
+
+/// Creates the two canonical loops (input loop, then scan loop) used to lower
+/// a loop with a scan reduction. The trip count is recorded in
+/// ScanRedInfo->Span, a `scan.init` block is split off for the scan
+/// initialization code, and both loops are generated from the same body
+/// callback. The block following the scan loop is recorded as OMPScanFinish.
+///
+/// \returns The loop infos of the input loop and the scan loop, in that
+///          order, or an error.
+Expected<SmallVector<llvm::CanonicalLoopInfo *>>
+OpenMPIRBuilder::createCanonicalScanLoops(
+    const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
+    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
+    InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
+  LocationDescription ComputeLoc =
+      ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
+  updateToLocation(ComputeLoc);
+
+  SmallVector<CanonicalLoopInfo *> Result;
+
+  Value *TripCount = calculateCanonicalLoopTripCount(
+      ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
+  ScanRedInfo->Span = TripCount;
+  ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
+  Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
+
+  // Wraps the user body callback: creates the scan blocks, reroutes the body
+  // entry to the dispatch block, chains before-scan -> loop-exit -> original
+  // continuation, and runs the user callback in the before-scan block.
+  // Captures are spelled out; `[=]` would capture `this` implicitly.
+  auto BodyGen = [this, ScanRedInfo, BodyGenCB](InsertPointTy CodeGenIP,
+                                                Value *IV) {
+    Builder.restoreIP(CodeGenIP);
+    ScanRedInfo->IV = IV;
+    createScanBBs(ScanRedInfo);
+    BasicBlock *InputBlock = Builder.GetInsertBlock();
+    Instruction *Terminator = InputBlock->getTerminator();
+    assert(Terminator->getNumSuccessors() == 1);
+    BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
+    Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
+    emitBlock(ScanRedInfo->OMPBeforeScanBlock,
+              Builder.GetInsertBlock()->getParent());
+    Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
+    emitBlock(ScanRedInfo->OMPScanLoopExit,
+              Builder.GetInsertBlock()->getParent());
+    Builder.CreateBr(ContinueBlock);
+    Builder.SetInsertPoint(
+        ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
+    return BodyGenCB(Builder.saveIP(), IV);
+  };
+
+  // Generates the input loop at the current insertion point.
+  auto InputLoopGen = [&]() -> Error {
+    Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
+        Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
+        ComputeIP, Name, true, ScanRedInfo);
+    if (!LoopInfo)
+      return LoopInfo.takeError();
+    Result.push_back(*LoopInfo);
+    Builder.restoreIP((*LoopInfo)->getAfterIP());
+    return Error::success();
+  };
+  // Generates the scan loop at the given location and records the block that
+  // follows it as the scan finish block.
+  auto ScanLoopGen = [&](LocationDescription Loc) -> Error {
+    Expected<CanonicalLoopInfo *> LoopInfo =
+        createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
+                            InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
+    if (!LoopInfo)
+      return LoopInfo.takeError();
+    Result.push_back(*LoopInfo);
+    Builder.restoreIP((*LoopInfo)->getAfterIP());
+    ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
+    return Error::success();
+  };
+  Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
+  if (Err)
+    return Err;
+  return Result;
+}
+
Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
bool IsSigned, bool InclusiveStop, const Twine &Name) {
@@ -4174,7 +4578,8 @@ Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
- InsertPointTy ComputeIP, const Twine &Name) {
+ InsertPointTy ComputeIP, const Twine &Name, bool InScan,
+ ScanInfo *ScanRedInfo) {
LocationDescription ComputeLoc =
ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
@@ -4185,6 +4590,8 @@ Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
Builder.restoreIP(CodeGenIP);
Value *Span = Builder.CreateMul(IV, Step);
Value *IndVar = Builder.CreateAdd(Span, Start);
+ if (InScan)
+ ScanRedInfo->IV = IndVar;
return BodyGenCB(Builder.saveIP(), IndVar);
};
LocationDescription LoopLoc =
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index d6b578aa8ffd1..b7a060bb3563d 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -23,6 +23,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include <cstdlib>
#include <optional>
using namespace llvm;
@@ -5360,6 +5361,144 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) {
EXPECT_TRUE(findGEPZeroOne(ReductionFn->getArg(1), FirstRHS, SecondRHS));
}
+/// Test helper: asks \p OMPBuilder to create a scan directive at \p Loc for the
+/// single variable \p scanVar of type \p scanType, then moves \p Builder to the
+/// insertion point that OpenMPIRBuilder::createScan returned.
+///
+/// \p allocaIP and \p ScanRedInfo are forwarded to createScan unchanged.
+/// NOTE(review): the literal `true` is createScan's fifth argument; its
+/// meaning is not visible here -- confirm against the declaration.
+static void createScan(llvm::Value *scanVar, llvm::Type *scanType,
+                       OpenMPIRBuilder &OMPBuilder, IRBuilder<> &Builder,
+                       OpenMPIRBuilder::LocationDescription Loc,
+                       OpenMPIRBuilder::InsertPointTy &allocaIP,
+                       ScanInfo *&ScanRedInfo) {
+  ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterScanIP,
+                       OMPBuilder.createScan(Loc, allocaIP, {scanVar},
+                                             {scanType}, true, ScanRedInfo));
+  Builder.restoreIP(AfterScanIP);
+}
+/*
+ The following is pseudocode for the code generated by the test case below:
+ <declare pointer to buffer> ptr
+ size num_iters = 99 // the test loop runs from 1 to 100 (exclusive), step 1
+ // temp buffer allocation
+ omp masked {
+ buff = malloc(num_iters*scanvarstype)
+ *ptr = buff
+ }
+ barrier;
+ // input phase loop
+ for (i: 0..<num_iters>) {
+ <input phase>;
+ buffer = *ptr;
+ buffer[i] = red;
+ }
+ // scan reduction
+ omp masked
+ {
+ for (int k = 0; k != ceil(log2(num_iters)); ++k) {
+ i=pow(2,k)
+ for (size cnt = last_iter; cnt >= i; --cnt) {
+ buffer = *ptr;
+ buffer[cnt] op= buffer[cnt-i];
+ }
+ }
+ }
+ barrier;
+ // scan phase loop
+ for (0..<num_iters>) {
+ buffer = *ptr;
+ red = buffer[i] ;
+ <scan phase>;
+ }
+ // temp buffer deletion
+ omp masked {
+ free(*ptr)
+ }
+ barrier;
+*/
+/// Builds the two canonical loops of an inscan reduction over one float
+/// variable, emits the scan reduction after the input loop, and then checks
+/// how many malloc/free, __kmpc_masked/__kmpc_end_masked and log2/ceil calls
+/// ended up in the function.
+TEST_F(OpenMPIRBuilderTest, ScanReduction) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  IRBuilder<> Builder(BB);
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  // Only the type of the function argument is used; the bounds are constants.
+  Type *LCTy = F->getArg(0)->getType();
+  Value *StartVal = ConstantInt::get(LCTy, 1);
+  Value *StopVal = ConstantInt::get(LCTy, 100);
+  Value *Step = ConstantInt::get(LCTy, 1);
+  auto AllocaIP = Builder.saveIP();
+
+  Value *ScanVar = Builder.CreateAlloca(Builder.getFloatTy());
+  Value *OrigVar = Builder.CreateAlloca(Builder.getFloatTy());
+  unsigned NumBodiesGenerated = 0;
+  ASSERT_EXPECTED_INIT(ScanInfo *, ScanRedInfo,
+                       OMPBuilder.scanInfoInitialize());
+
+  // Invoked once per generated loop; each body contains just the scan
+  // directive emitted by the createScan() helper.
+  auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, llvm::Value *LC) {
+    ++NumBodiesGenerated;
+    Builder.restoreIP(CodeGenIP);
+    createScan(ScanVar, Builder.getFloatTy(), OMPBuilder, Builder, Loc,
+               AllocaIP, ScanRedInfo);
+    return Error::success();
+  };
+
+  ASSERT_EXPECTED_INIT(llvm::SmallVector<CanonicalLoopInfo *>, Loops,
+                       OMPBuilder.createCanonicalScanLoops(
+                           Loc, LoopBodyGenCB, StartVal, StopVal, Step,
+                           /*IsSigned=*/false, /*InclusiveStop=*/false,
+                           Builder.saveIP(), "scan", ScanRedInfo));
+  // front()/back() are only meaningful when both loops were returned.
+  ASSERT_EQ(Loops.size(), 2U);
+  CanonicalLoopInfo *InputLoop = Loops.front();
+  CanonicalLoopInfo *ScanLoop = Loops.back();
+  Builder.restoreIP(ScanLoop->getAfterIP());
+  InputLoop->assertOK();
+  ScanLoop->assertOK();
+
+  EXPECT_EQ(ScanLoop->getAfter(), Builder.GetInsertBlock());
+  EXPECT_EQ(NumBodiesGenerated, 2U);
+
+  SmallVector<OpenMPIRBuilder::ReductionInfo> ReductionInfos = {
+      {Builder.getFloatTy(), OrigVar, ScanVar,
+       /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, sumReduction,
+       /*ReductionGenClang=*/nullptr, sumAtomicReduction}};
+  // The reduction is emitted at the input loop's after-IP.
+  OpenMPIRBuilder::LocationDescription RedLoc({InputLoop->getAfterIP(), DL});
+  llvm::BasicBlock *Cont = splitBB(Builder, false, "omp.scan.loop.cont");
+  ASSERT_EXPECTED_INIT(
+      InsertPointTy, retIp,
+      OMPBuilder.emitScanReduction(RedLoc, ReductionInfos, ScanRedInfo));
+  Builder.restoreIP(retIp);
+  Builder.CreateBr(Cont);
+  Builder.SetInsertPoint(Cont);
+
+  unsigned NumMallocs = 0;
+  unsigned NumFrees = 0;
+  unsigned NumMasked = 0;
+  unsigned NumEndMasked = 0;
+  unsigned NumLog = 0;
+  unsigned NumCeil = 0;
+  for (Instruction &I : instructions(F)) {
+    auto *Call = dyn_cast<CallInst>(&I);
+    if (!Call)
+      continue;
+    // getCalledFunction() returns null for indirect calls; such a call can
+    // never be one of the named callees counted below.
+    Function *Callee = Call->getCalledFunction();
+    if (!Callee)
+      continue;
+    StringRef Name = Callee->getName();
+    if (Name.equals_insensitive("malloc"))
+      ++NumMallocs;
+    else if (Name.equals_insensitive("free"))
+      ++NumFrees;
+    else if (Name.equals_insensitive("__kmpc_masked"))
+      ++NumMasked;
+    else if (Name.equals_insensitive("__kmpc_end_masked"))
+      ++NumEndMasked;
+    else if (Name.equals_insensitive("llvm.log2.f64"))
+      ++NumLog;
+    else if (Name.equals_insensitive("llvm.ceil.f64"))
+      ++NumCeil;
+  }
+  EXPECT_EQ(NumMasked, 3U);
+  EXPECT_EQ(NumEndMasked, 3U);
+  EXPECT_EQ(NumMallocs, 1U);
+  EXPECT_EQ(NumFrees, 1U);
+  EXPECT_EQ(NumLog, 1U);
+  EXPECT_EQ(NumCeil, 1U);
+}
+
TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
More information about the llvm-commits
mailing list