[clang] [llvm] [mlir] [OpenMP] Migrate GPU Reductions CodeGen from Clang to OMPIRBuilder (PR #80343)
Akash Banerjee via cfe-commits
cfe-commits at lists.llvm.org
Tue Apr 2 09:02:36 PDT 2024
https://github.com/TIFitis updated https://github.com/llvm/llvm-project/pull/80343
>From 960f946643c3d96c1f1f178c3800b5b889050d0a Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Mon, 15 Jan 2024 11:34:09 +0000
Subject: [PATCH 01/18] Jan migration work
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 109 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 1216 ++++++++++++++++-
2 files changed, 1304 insertions(+), 21 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index c9ee0c25194c23..e1dba2339338a3 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -16,6 +16,7 @@
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Allocator.h"
@@ -99,14 +100,22 @@ class OpenMPIRBuilderConfig {
/// expanded.
std::optional<bool> IsGPU;
- // Flag for specifying if offloading is mandatory.
+ /// Flag for specifying if offloading is mandatory.
std::optional<bool> OpenMPOffloadMandatory;
+ /// Name of the target processor.
+ StringRef TargetCPU;
+ /// String representation of the target processor's features.
+ StringRef TargetFeatures;
+
/// First separator used between the initial two parts of a name.
std::optional<StringRef> FirstSeparator;
/// Separator used between all of the rest consecutive parts of s name
std::optional<StringRef> Separator;
+ // Grid Value for the GPU target
+ std::optional<omp::GV> GridValue;
+
OpenMPIRBuilderConfig();
OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
bool OpenMPOffloadMandatory,
@@ -132,6 +141,11 @@ class OpenMPIRBuilderConfig {
return *OpenMPOffloadMandatory;
}
+ omp::GV getGridValue() const {
+ assert(GridValue.has_value() && "GridValue is not set");
+ return *GridValue;
+ }
+
bool hasRequiresFlags() const { return RequiresFlags; }
bool hasRequiresReverseOffload() const;
bool hasRequiresUnifiedAddress() const;
@@ -167,6 +181,7 @@ class OpenMPIRBuilderConfig {
void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
void setSeparator(StringRef S) { Separator = S; }
+ void setGridValue(omp::GV G) { GridValue = G; }
void setHasRequiresReverseOffload(bool Value);
void setHasRequiresUnifiedAddress(bool Value);
@@ -1235,18 +1250,26 @@ class OpenMPIRBuilder {
getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
StringRef ParentName = "");
+ // using ReductionGenTy =
+ // function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;
+
+ // using AtomicReductionGenTy =
+ // function_ref<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+
+ /// Owning equivalents of OpenMPIRBuilder's (Atomic)ReductionGen
+ /// function_ref aliases; these std::function versions are used to
+ /// store lambdas with captured state.
/// Functions used to generate reductions. Such functions take two Values
/// representing LHS and RHS of the reduction, respectively, and a reference
/// to the value that is updated to refer to the reduction result.
- using ReductionGenTy =
- function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;
-
+ using ReductionGenTy = std::function<OpenMPIRBuilder::InsertPointTy(
+ OpenMPIRBuilder::InsertPointTy, Value *, Value *, Value *&)>;
/// Functions used to generate atomic reductions. Such functions take two
/// Values representing pointers to LHS and RHS of the reduction, as well as
/// the element type of these pointers. They are expected to atomically
/// update the LHS to the reduced value.
- using AtomicReductionGenTy =
- function_ref<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+ using AtomicReductionGenTy = std::function<OpenMPIRBuilder::InsertPointTy(
+ OpenMPIRBuilder::InsertPointTy, Type *, Value *, Value *)>;
/// Information about an OpenMP reduction.
struct ReductionInfo {
@@ -1256,6 +1279,10 @@ class OpenMPIRBuilder {
: ElementType(ElementType), Variable(Variable),
PrivateVariable(PrivateVariable), ReductionGen(ReductionGen),
AtomicReductionGen(AtomicReductionGen) {}
+ ReductionInfo(Value *PrivateVariable)
+ : ElementType(nullptr), Variable(nullptr),
+ PrivateVariable(PrivateVariable), ReductionGen(),
+ AtomicReductionGen() {}
/// Reduction element type, must match pointee type of variable.
Type *ElementType;
@@ -1278,6 +1305,69 @@ class OpenMPIRBuilder {
AtomicReductionGenTy AtomicReductionGen;
};
+ /// A class that manages the reduction info to facilitate lowering of
+ /// reductions at multiple levels of parallelism. For example handling teams
+ /// and parallel reductions on GPUs
+
+ class ReductionInfoManager {
+ private:
+ SmallVector<ReductionInfo> ReductionInfos;
+ std::optional<InsertPointTy> PrivateVarAllocaIP;
+
+ public:
+ ReductionInfoManager(){};
+ void clear() {
+ ReductionInfos.clear();
+ PrivateVarAllocaIP.reset();
+ }
+
+ Value *
+ allocatePrivateReductionVar(IRBuilderBase &builder,
+ llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+ Type *VarType) {
+ llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext());
+ llvm::Value *var = builder.CreateAlloca(VarType);
+ var->setName("private_redvar");
+ llvm::Value *castVar =
+ builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+ ReductionInfos.push_back(ReductionInfo(castVar));
+ return castVar;
+ }
+
+ ReductionInfo getReductionInfo(unsigned Index) {
+ return ReductionInfos[Index];
+ }
+ ReductionInfo setReductionInfo(unsigned Index, ReductionInfo &RI) {
+ return ReductionInfos[Index] = RI;
+ }
+ Value *getPrivateReductionVariable(unsigned Index) {
+ return ReductionInfos[Index].PrivateVariable;
+ }
+ SmallVector<ReductionInfo> &getReductionInfos() { return ReductionInfos; }
+
+ bool hasPrivateVarAllocaIP() { return PrivateVarAllocaIP.has_value(); }
+ InsertPointTy getPrivateVarAllocaIP() {
+ assert(PrivateVarAllocaIP.has_value() && "AllocaIP not set");
+ return *PrivateVarAllocaIP;
+ }
+ void setPrivateVarAllocaIP(InsertPointTy IP) { PrivateVarAllocaIP = IP; }
+ };
+
+ /// \param Loc The location where the reduction was
+ /// encountered. Must be within the associate
+ /// directive and after the last local access to the
+ /// reduction variables.
+ /// \param AllocaIP An insertion point suitable for allocas usable
+ /// in reductions.
+ /// \param ReductionInfos A list of info on each reduction variable.
+ /// \param IsNoWait A flag set if the reduction is marked as nowait.
+ InsertPointTy createReductionsGPU(const LocationDescription &Loc,
+ InsertPointTy AllocaIP,
+ ArrayRef<ReductionInfo> ReductionInfos,
+ bool IsNoWait = false,
+ bool IsTeamsReduction = false,
+ bool HasDistribute = false);
+
// TODO: provide atomic and non-atomic reduction generators for reduction
// operators defined by the OpenMP specification.
@@ -1344,7 +1434,9 @@ class OpenMPIRBuilder {
InsertPointTy createReductions(const LocationDescription &Loc,
InsertPointTy AllocaIP,
ArrayRef<ReductionInfo> ReductionInfos,
- bool IsNoWait = false, bool IsByRef = false);
+ bool IsNoWait = false, bool IsByRef = false,
+ bool IsTeamsReduction = false,
+ bool HasDistribute = false);
///}
@@ -1485,6 +1577,9 @@ class OpenMPIRBuilder {
/// Info manager to keep track of target regions.
OffloadEntriesInfoManager OffloadInfoManager;
+ /// Info manager to keep track of reduction information.
+ ReductionInfoManager RIManager;
+
/// The target triple of the underlying module.
const Triple T;
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 16507a69ea8502..2b48df24e7e734 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -145,10 +145,19 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
}
#endif
-static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
+Function *GLOBAL_ReductionFunc = nullptr;
+
+static uint64_t getTypeSizeInBytes(Module &M, Type *Type) {
+ return divideCeil(M.getDataLayout().getTypeSizeInBits(Type), 8);
+}
+
+static Value *getTypeSizeInBytesValue(IRBuilder<> &Builder, Module &M,
+ Type *Type) {
+ return Builder.getInt64(getTypeSizeInBytes(M, Type));
+}
+
+static const omp::GV &getGridValue(const Triple &T, StringRef Features) {
if (T.isAMDGPU()) {
- StringRef Features =
- Kernel->getFnAttribute("target-features").getValueAsString();
if (Features.count("+wavefrontsize64"))
return omp::getAMDGPUGridValues<64>();
return omp::getAMDGPUGridValues<32>();
@@ -158,6 +167,17 @@ static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
llvm_unreachable("No grid value available for this architecture!");
}
+static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
+ if (T.isAMDGPU()) {
+ StringRef Features =
+ Kernel->getFnAttribute("target-features").getValueAsString();
+ return getGridValue(T, Features);
+ }
+ if (T.isNVPTX())
+ return omp::NVPTXGridValues;
+ llvm_unreachable("No grid value available for this architecture!");
+}
+
/// Determine which scheduling algorithm to use, determined from schedule clause
/// arguments.
static OMPScheduleType
@@ -2096,36 +2116,1182 @@ OpenMPIRBuilder::createSection(const LocationDescription &Loc,
/*IsCancellable*/ true);
}
+static Value *getGPUWarpSize(Module &M, OpenMPIRBuilder &OMPBuilder) {
+ return OMPBuilder.Builder.CreateCall(
+ OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size),
+ {});
+}
+
+static Value *getGPUThreadID(Module &M, OpenMPIRBuilder &OMPBuilder) {
+ return OMPBuilder.Builder.CreateCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ M, OMPRTL___kmpc_get_hardware_thread_id_in_block),
+ {});
+}
+
+static Value *getGPUNumThreads(Module &M, OpenMPIRBuilder &OMPBuilder) {
+ const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
+ llvm::Function *F = M.getFunction(LocSize);
+ if (!F) {
+ LLVMContext &Ctx = M.getContext();
+ Type *I32Type = Type::getInt32Ty(Ctx);
+
+ F = Function::Create(FunctionType::get(I32Type, std::nullopt, false),
+ GlobalVariable::ExternalLinkage, LocSize, M);
+ }
+ return OMPBuilder.Builder.CreateCall(F, std::nullopt, "nvptx_num_threads");
+}
+
+static Value *getNVPTXWarpID(Module &M, OpenMPIRBuilder &OMPIRBuilder) {
+ unsigned LaneIDBits =
+ llvm::Log2_32(OMPIRBuilder.Config.getGridValue().GV_Warp_Size);
+ return OMPIRBuilder.Builder.CreateAShr(getGPUThreadID(M, OMPIRBuilder),
+ LaneIDBits, "nvptx_warp_id");
+}
+
+static Value *getNVPTXLaneID(Module &M, OpenMPIRBuilder &OMPIRBuilder) {
+ unsigned LaneIDBits =
+ llvm::Log2_32(OMPIRBuilder.Config.getGridValue().GV_Warp_Size);
+ assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
+ unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
+ return OMPIRBuilder.Builder.CreateAnd(
+ getGPUThreadID(M, OMPIRBuilder),
+ OMPIRBuilder.Builder.getInt32(LaneIDMask), "nvptx_lane_id");
+}
+
+namespace {
+enum CopyAction : unsigned {
+ // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
+ // the warp using shuffle instructions.
+ RemoteLaneToThread,
+ // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
+ ThreadCopy,
+};
+} // namespace
+
+struct CopyOptionsTy {
+ llvm::Value *RemoteLaneOffset;
+ llvm::Value *ScratchpadIndex;
+ llvm::Value *ScratchpadWidth;
+};
+
+static Value *castValueToType(Module &M, OpenMPIRBuilder &OMPBuilder,
+ Value *From, Type *ToType,
+ OpenMPIRBuilder::InsertPointTy AllocaIP,
+ const OpenMPIRBuilder::LocationDescription &Loc) {
+ IRBuilder<> &Builder = OMPBuilder.Builder;
+ Type *FromType = From->getType();
+ uint64_t FromSize =
+ divideCeil(M.getDataLayout().getTypeSizeInBits(FromType), 8);
+ uint64_t ToSize = divideCeil(M.getDataLayout().getTypeSizeInBits(ToType), 8);
+ assert(FromSize > 0 && "From size must be greater than zero");
+ assert(ToSize > 0 && "To size must be greater than zero");
+ if (FromType == ToType)
+ return From;
+ if (FromSize == ToSize)
+ return Builder.CreateBitCast(From, ToType);
+ if (ToType->isIntegerTy() && FromType->isIntegerTy())
+ // FIXME(JAN): Assuming signed integer here, not sure how to find out
+ // if unsigned
+ return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
+ OpenMPIRBuilder::InsertPointTy CurIP = Builder.saveIP();
+ Builder.restoreIP(AllocaIP);
+ Value *CastItem = Builder.CreateAlloca(ToType, nullptr, "cast_tmp");
+ Builder.restoreIP(CurIP);
+
+ Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ CastItem, FromType->getPointerTo(), "valcastitem");
+ Builder.CreateStore(From, ValCastItem);
+ return Builder.CreateLoad(ToType, CastItem, "castitemload");
+}
+
+static Value *
+createRuntimeShuffleFunction(Module &M, OpenMPIRBuilder &OMPBuilder,
+ const OpenMPIRBuilder::LocationDescription &Loc,
+ OpenMPIRBuilder::InsertPointTy AllocaIP,
+ Value *Element, Type *ElementType, Value *Offset) {
+ LLVMContext &Ctx = M.getContext();
+ IRBuilder<> &Builder = OMPBuilder.Builder;
+ uint64_t Size =
+ divideCeil(M.getDataLayout().getTypeSizeInBits(ElementType), 8);
+ assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
+ Function *ShuffleFunc = OMPBuilder.getOrCreateRuntimeFunctionPtr(
+ Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
+ : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
+ Type *IntType = Builder.getIntNTy(Size <= 4 ? 32 : 64);
+ Value *ElemCast = Builder.CreateCast(Instruction::SExt, Element, IntType);
+ Value *WarpSize = getGPUWarpSize(M, OMPBuilder);
+ Value *WarpSizeCast =
+ Builder.CreateIntCast(WarpSize, Type::getInt16Ty(Ctx), /*isSigned=*/true);
+ Value *ShuffleCall =
+ Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
+ return castValueToType(M, OMPBuilder, ShuffleCall, IntType, AllocaIP, Loc);
+}
+
+static void shuffleAndStore(Value *SrcAddr, Value *DstAddr, Type *ElementType,
+ llvm::Value *Offset, Type *ReductionArrayTy,
+ const OpenMPIRBuilder::LocationDescription &Loc,
+ Module &M, OpenMPIRBuilder &OMPBuilder,
+ OpenMPIRBuilder::InsertPointTy AllocaIP) {
+ LLVMContext &Ctx = M.getContext();
+ IRBuilder<> &Builder = OMPBuilder.Builder;
+ uint64_t Size =
+ divideCeil(M.getDataLayout().getTypeSizeInBits(ElementType), 8);
+ Type *PtrTy = PointerType::getUnqual(Ctx);
+ Value *ElemPtr = DstAddr;
+ Value *Ptr = SrcAddr;
+ // Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ // Builder.CreateConstGEP1_64(ReductionArrayTy, SrcAddr, 1), PtrTy);
+ for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
+ if (Size < IntSize)
+ continue;
+ // FIXME(JAN): Check if there is a function to convert from bytes to bits
+ Type *IntTy = Builder.getIntNTy(IntSize * 8);
+ Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ Ptr, IntTy->getPointerTo(), "ptrcast");
+ ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ElemPtr, IntTy->getPointerTo(), "elemptrcast");
+
+ // FIXME(JAN): Implement loop to handle larger size
+ assert(((Size / IntSize) <= 1) && "Unsupported IntSize");
+ Value *Val = Builder.CreateLoad(IntTy, Ptr);
+ Value *Res = createRuntimeShuffleFunction(M, OMPBuilder, Loc, AllocaIP, Val,
+ IntTy, Offset);
+ Builder.CreateStore(Res, ElemPtr);
+ Ptr = Builder.CreateConstGEP1_64(ReductionArrayTy, Ptr, 1, "ptrgep");
+ ElemPtr =
+ Builder.CreateConstGEP1_64(ReductionArrayTy, ElemPtr, 1, "elemptrgep");
+ Size = Size % IntSize;
+ }
+}
+
+static void
+emitReductionListCopy(CopyAction Action, Type *ReductionArrayTy,
+ ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+ Value *SrcBase, Value *DestBase, Module &M,
+ OpenMPIRBuilder &OMPBuilder,
+ const OpenMPIRBuilder::LocationDescription &Loc,
+ OpenMPIRBuilder::InsertPointTy AllocaIP,
+ CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
+ LLVMContext &Ctx = M.getContext();
+ IRBuilder<> &Builder = OMPBuilder.Builder;
+ Type *PtrTy = PointerType::getUnqual(Ctx);
+
+ Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
+
+ for (auto En : enumerate(ReductionInfos)) {
+ const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+ Value *SrcElementAddr = nullptr;
+ Value *DestElementAddr = nullptr;
+ Value *DestElementPtrAddr = nullptr;
+ bool ShuffleInElement = false;
+ bool UpdateDestListPtr = false;
+
+ // Step 1.1: Get the address for the src element in the Reduce list.
+ Value *SrcElementPtrAddr = Builder.CreateConstGEP2_64(
+ ReductionArrayTy, SrcBase, 0, En.index(), "srcelementptraddr");
+ SrcElementAddr =
+ Builder.CreateLoad(PtrTy, SrcElementPtrAddr, "srcelementaddr");
+
+ // Step 1.2: Create a temporary to store the element in the destination
+ // Reduce list.
+ DestElementPtrAddr = Builder.CreateInBoundsGEP(
+ ReductionArrayTy, DestBase,
+ {Builder.getInt64(0), Builder.getInt64(En.index())},
+ "destelementptraddr");
+ switch (Action) {
+ case RemoteLaneToThread: {
+ OpenMPIRBuilder::InsertPointTy CurIP = Builder.saveIP();
+ Builder.restoreIP(AllocaIP);
+ DestElementAddr = Builder.CreateAlloca(RI.ElementType, nullptr,
+ ".omp.reduction.element");
+ Builder.restoreIP(CurIP);
+ ShuffleInElement = true;
+ UpdateDestListPtr = true;
+ break;
+ }
+ case ThreadCopy: {
+ DestElementAddr =
+ Builder.CreateLoad(PtrTy, DestElementPtrAddr, "destelementaddr");
+ break;
+ }
+ }
+
+ // FIXME(JAN): Original code in clang uses <Addr>.withElementType(...)
+ // check if this generates any code
+
+ if (ShuffleInElement) {
+ shuffleAndStore(SrcElementAddr, DestElementAddr, RI.ElementType,
+ RemoteLaneOffset, ReductionArrayTy, Loc, M, OMPBuilder,
+ AllocaIP);
+ } else {
+ // FIXME(JAN): Assume Scalar here (TEK_Scalar in Clang)
+ Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
+ Builder.CreateStore(Elem, DestElementAddr);
+ }
+ // Step 3.1: Modify reference in dest Reduce list as needed.
+ // Modifying the reference in Reduce list to point to the newly
+ // created element. The element is live in the current function
+ // scope and that of functions it invokes (i.e., reduce_function).
+ // RemoteReduceData[i] = (void*)&RemoteElem
+ if (UpdateDestListPtr) {
+ Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ DestElementAddr, PtrTy, "castdestaddr");
+ Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
+ }
+ }
+}
+
+static OpenMPIRBuilder::InsertPointTy getIPAfterInstr(Instruction *I) {
+ BasicBlock::iterator it(I);
+ it++;
+ return OpenMPIRBuilder::InsertPointTy(I->getParent(), it);
+}
+
+static Function *emitShuffleAndReduceFunction(
+ Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+ ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
+ OpenMPIRBuilder &OMPBuilder) {
+ IRBuilder<> &Builder = OMPBuilder.Builder;
+
+ LLVMContext &Ctx = M.getContext();
+ Type *VoidTy = Type::getVoidTy(Ctx);
+ Type *PtrTy = PointerType::getUnqual(Ctx);
+ Type *I16Type = Type::getInt16Ty(Ctx);
+ auto FuncTy = FunctionType::get(VoidTy, {PtrTy, I16Type, I16Type, I16Type},
+ /* IsVarArg */ false);
+ Function *SarFunc =
+ Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+ "_omp_reduction_shuffle_and_reduce_func", &M);
+ SarFunc->setDoesNotRecurse();
+
+ // Set arg names
+ Argument *Arg0 = SarFunc->getArg(0);
+ Argument *Arg1 = SarFunc->getArg(1);
+ Argument *Arg2 = SarFunc->getArg(2);
+ Argument *Arg3 = SarFunc->getArg(3);
+ Arg0->setName("reduce_list_arg");
+ Arg1->setName("lane_id_arg");
+ Arg2->setName("remote_lane_offset_arg");
+ Arg3->setName("algo_ver_arg");
+
+ BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", SarFunc);
+ Builder.SetInsertPoint(EntryBlock);
+
+ Type *Arg0Type = Arg0->getType();
+ Type *ArgNType = Arg1->getType();
+ Type *ArgNPtrType = Arg1->getType()->getPointerTo();
+ Value *ReduceListAlloca =
+ Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+ Value *LaneIdAlloca =
+ Builder.CreateAlloca(ArgNType, nullptr, Arg1->getName() + ".addr");
+ Value *RemoteLaneOffsetAlloca =
+ Builder.CreateAlloca(ArgNType, nullptr, Arg2->getName() + ".addr");
+ Value *AlgoVerAlloca =
+ Builder.CreateAlloca(ArgNType, nullptr, Arg3->getName() + ".addr");
+ // FIXME(Jan): Compute reduction list array type
+ auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+ Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
+ RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
+
+ Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".acast");
+ Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LaneIdAlloca, ArgNPtrType, LaneIdAlloca->getName() + ".acast");
+ Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RemoteLaneOffsetAlloca, ArgNPtrType,
+ RemoteLaneOffsetAlloca->getName() + ".acast");
+ Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ AlgoVerAlloca, ArgNPtrType, AlgoVerAlloca->getName() + ".acast");
+ Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RemoteReductionListAlloca, PtrTy,
+ RemoteReductionListAlloca->getName() + ".acast");
+
+ Builder.CreateStore(Arg0, ReduceListAddrCast);
+ Builder.CreateStore(Arg1, LaneIdAddrCast);
+ Builder.CreateStore(Arg2, RemoteLaneOffsetAddrCast);
+ Builder.CreateStore(Arg3, AlgoVerAddrCast);
+
+ Value *ReduceList =
+ Builder.CreateLoad(Arg0Type, ReduceListAddrCast, "reduce_list");
+ Value *LaneId = Builder.CreateLoad(ArgNType, LaneIdAddrCast, "lane_id");
+ Value *RemoteLaneOffset = Builder.CreateLoad(
+ ArgNType, RemoteLaneOffsetAddrCast, "remote_lane_offset");
+ Value *AlgoVer = Builder.CreateLoad(ArgNType, AlgoVerAddrCast, "algo_ver");
+
+ OpenMPIRBuilder::InsertPointTy AllocaIP =
+ getIPAfterInstr(RemoteReductionListAlloca);
+ emitReductionListCopy(RemoteLaneToThread, RedListArrayTy, ReductionInfos,
+ ReduceList, RemoteListAddrCast, M, OMPBuilder, Loc,
+ AllocaIP, {RemoteLaneOffset, nullptr, nullptr});
+
+ // The actions to be performed on the Remote Reduce list is dependent
+ // on the algorithm version.
+ //
+ // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
+ // LaneId % 2 == 0 && Offset > 0):
+ // do the reduction value aggregation
+ //
+ // The thread local variable Reduce list is mutated in place to host the
+ // reduced data, which is the aggregated value produced from local and
+ // remote lanes.
+ //
+ // Note that AlgoVer is expected to be a constant integer known at compile
+ // time.
+ // When AlgoVer==0, the first conjunction evaluates to true, making
+ // the entire predicate true during compile time.
+ // When AlgoVer==1, the second conjunction has only the second part to be
+ // evaluated during runtime. Other conjunctions evaluate to false
+ // during compile time.
+ // When AlgoVer==2, the third conjunction has only the second part to be
+ // evaluated during runtime. Other conjunctions evaluate to false
+ // during compile time.
+ Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
+ Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+ Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
+ Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
+ Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
+ Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
+ Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
+ Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
+ Value *RemoteOffsetComp =
+ Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
+ Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
+ Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
+ Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
+
+ BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", SarFunc);
+ BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", SarFunc);
+ BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont", SarFunc);
+
+ Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
+ Builder.SetInsertPoint(ThenBB);
+ // reduce_function(LocalReduceList, RemoteReduceList)
+ Value *LocalReduceListPtr =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(ReduceList, PtrTy);
+ Value *RemoteReduceListPtr =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(RemoteListAddrCast, PtrTy);
+ Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
+ Builder.CreateBr(MergeBB);
+ Builder.SetInsertPoint(ElseBB);
+ Builder.CreateBr(MergeBB);
+ Builder.SetInsertPoint(MergeBB);
+
+ Value *Algo1_2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+ Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
+ Value *CondCopy = Builder.CreateAnd(Algo1_2, LaneIdGtOffset);
+
+ BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "cpy_then", SarFunc);
+ BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "cpy_else", SarFunc);
+ BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "cpy_ifcont", SarFunc);
+
+ Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
+
+ Builder.SetInsertPoint(CpyThenBB);
+ emitReductionListCopy(ThreadCopy, RedListArrayTy, ReductionInfos,
+ RemoteListAddrCast, ReduceList, M, OMPBuilder, Loc,
+ AllocaIP);
+ Builder.CreateBr(CpyMergeBB);
+ Builder.SetInsertPoint(CpyElseBB);
+ Builder.CreateBr(CpyMergeBB);
+ Builder.SetInsertPoint(CpyMergeBB);
+ Builder.CreateRetVoid();
+
+ return SarFunc;
+}
+
+static Function *emitInterWarpCopyFunction(
+ Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+ ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+ OpenMPIRBuilder &OMPBuilder) {
+ IRBuilder<> &Builder = OMPBuilder.Builder;
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+ LLVMContext &Ctx = M.getContext();
+ Type *VoidTy = Type::getVoidTy(Ctx);
+ Type *PtrTy = PointerType::getUnqual(Ctx);
+ Type *I32Type = Type::getInt32Ty(Ctx);
+ auto FuncTy =
+ FunctionType::get(VoidTy, {PtrTy, I32Type}, /* IsVarArg */ false);
+ Function *WcFunc =
+ Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+ "_omp_reduction_inter_warp_copy_func", &M);
+ WcFunc->setDoesNotRecurse();
+
+ // Set arg names
+ Argument *Arg0 = WcFunc->getArg(0);
+ Argument *Arg1 = WcFunc->getArg(1);
+ Arg0->setName("reduce_list");
+ Arg1->setName("num_warps");
+
+ // Ensure data transfer storage
+ unsigned WarpSize = OMPBuilder.Config.getGridValue().GV_Warp_Size;
+ // FIXME(Jan): Not sure about the array type here, but it is I32 in Clang
+ auto *ArrayTy = ArrayType::get(I32Type, WarpSize);
+ StringRef TransferMediumName =
+ "__openmp_nvptx_data_transfer_temporary_storage";
+ GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
+ if (!TransferMedium) {
+ unsigned SharedAddressSpace =
+ 3; /* FIXME(Jan): C.getTargetAddressSpace(LangAS::cuda_shared); */
+ TransferMedium = new GlobalVariable(
+ M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
+ UndefValue::get(ArrayTy), TransferMediumName,
+ /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
+ SharedAddressSpace);
+ }
+
+ BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", WcFunc);
+ Builder.SetInsertPoint(EntryBlock);
+
+ Type *Arg0Type = Arg0->getType();
+ Type *Arg1Type = Arg1->getType();
+ Value *ReduceListAlloca =
+ Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+ Instruction *NumWarpsAlloca =
+ Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+ Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".acast");
+ Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ NumWarpsAlloca, Arg1Type->getPointerTo(),
+ NumWarpsAlloca->getName() + ".acast");
+ Builder.CreateStore(Arg0, ReduceListAddrCast);
+ Builder.CreateStore(Arg1, NumWarpsAddrCast);
+
+ // Get GPU Info
+ Value *ThreadID = getGPUThreadID(M, OMPBuilder);
+ Value *LaneID = getNVPTXLaneID(M, OMPBuilder);
+ Value *WarpID = getNVPTXWarpID(M, OMPBuilder);
+
+ Value *ReduceListArg =
+ Builder.CreateLoad(PtrTy, ReduceListAddrCast, "reduce_list_arg");
+
+ for (auto En : enumerate(ReductionInfos)) {
+ const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+ Type *ElementTy = RI.ElementType;
+ unsigned NumTypeBits = M.getDataLayout().getTypeSizeInBits(ElementTy);
+ unsigned RealTySize = divideCeil(NumTypeBits, 8);
+ for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
+ unsigned NumIters = RealTySize / TySize;
+ if (NumIters == 0)
+ continue;
+ // Type *CopyTy = Builder.getIntNTy(TySize);
+ Type *Int32Ty = Builder.getInt32Ty();
+ Value *Cnt = nullptr;
+ Value *CntAddrAcast = nullptr;
+ BasicBlock *PrecondBB = nullptr;
+ BasicBlock *ExitBB = nullptr;
+
+ if (NumIters > 1) {
+ OpenMPIRBuilder::InsertPointTy CurrIP = Builder.saveIP();
+ Builder.SetInsertPoint(NumWarpsAlloca);
+ Value *CntAddr = Builder.CreateAlloca(Int32Ty, nullptr, ".cnt.addr");
+ Builder.restoreIP(CurrIP);
+ CntAddrAcast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ CntAddr, PtrTy, CntAddr->getName() + ".acast");
+ Builder.CreateStore(Constant::getNullValue(Int32Ty), CntAddrAcast);
+ PrecondBB = BasicBlock::Create(Ctx, "precond", WcFunc);
+ ExitBB = BasicBlock::Create(Ctx, "exit", WcFunc);
+ BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body", WcFunc);
+ Builder.CreateBr(PrecondBB);
+ Builder.SetInsertPoint(PrecondBB);
+ Cnt = Builder.CreateLoad(Int32Ty, CntAddrAcast, "cnt");
+ Value *Cmp = Builder.CreateICmpULT(Cnt, Builder.getInt32(NumIters));
+ Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
+ Builder.SetInsertPoint(BodyBB);
+ }
+
+ OMPBuilder.createBarrier(
+ OpenMPIRBuilder::LocationDescription(Builder.saveIP(), Loc.DL),
+ omp::Directive::OMPD_unknown,
+ /* ForceSimpleCall */ false,
+ /* CheckCancelFlag */ true);
+ BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", WcFunc);
+ BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", WcFunc);
+ BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont", WcFunc);
+
+ // if (lane_id == 0)
+ Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
+ Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
+
+ // then
+ // Reduce element = LocalReduceList[i]
+ Builder.SetInsertPoint(ThenBB);
+ // FIXME(JAN): Should array type be passed in?
+ auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+ // FIXME(JAN): maybe it should be 0,0 and not use En.index()
+ Value *ReduceListElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedListArrayTy, ReduceListArg, 0, En.index());
+ Value *ReduceListElementPtr = Builder.CreateLoad(
+ PtrTy, ReduceListElementPtrPtr, "reduce_list_element_ptr");
+ if (NumIters > 1)
+ ReduceListElementPtr =
+ Builder.CreateGEP(Int32Ty, ReduceListElementPtr, Cnt);
+
+ Value *TransferElemAddr = Builder.CreateInBoundsGEP(
+ ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
+ Value *ReduceListElement = Builder.CreateLoad(
+ I32Type, ReduceListElementPtr, "reduce_list_element");
+ Builder.CreateStore(ReduceListElement, TransferElemAddr,
+ /*IsVolatile*/ true);
+ Builder.CreateBr(MergeBB);
+
+ // else
+ Builder.SetInsertPoint(ElseBB);
+ Builder.CreateBr(MergeBB);
+
+ // endif
+ Builder.SetInsertPoint(MergeBB);
+ OMPBuilder.createBarrier(
+ OpenMPIRBuilder::LocationDescription(Builder.saveIP(), Loc.DL),
+ omp::Directive::OMPD_unknown,
+ /* ForceSimpleCall */ false,
+ /* CheckCancelFlag */ true);
+
+ // Warp 0 copies reduce element from transfer medium
+ BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "w0then", WcFunc);
+ BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "w0else", WcFunc);
+ BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "w0ifcont", WcFunc);
+
+ Value *NumWarpsVal =
+ Builder.CreateLoad(I32Type, NumWarpsAddrCast, "num_warps");
+ Value *IsActiveThread =
+ Builder.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
+ Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
+
+ // W0then
+ // SrcMediumPtr = &medium[tid]
+ Builder.SetInsertPoint(W0ThenBB);
+ Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
+ ArrayTy, TransferMedium, {Builder.getInt64(0), ThreadID});
+ // SrcMediumVal = *SrcMediumPtr
+ // TODO(JAN): Bitcast here, but no load? skipping for now
+ Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedListArrayTy, ReduceListArg, 0, En.index());
+ Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+ if (NumIters > 1)
+ TargetElementPtr = Builder.CreateGEP(Int32Ty, TargetElementPtr, Cnt);
+
+ Value *SrcMediumValue =
+ Builder.CreateLoad(I32Type, SrcMediumPtrVal, /*IsVolatile*/ true);
+ Builder.CreateStore(SrcMediumValue, TargetElementPtr);
+ Builder.CreateBr(W0MergeBB);
+
+ // W0else
+ Builder.SetInsertPoint(W0ElseBB);
+ Builder.CreateBr(W0MergeBB);
+
+ // W0endif
+ Builder.SetInsertPoint(W0MergeBB);
+ if (NumIters > 1) {
+ Cnt = Builder.CreateNSWAdd(Cnt, Builder.getInt32(1));
+ Builder.CreateStore(Cnt, CntAddrAcast);
+ Builder.CreateBr(PrecondBB);
+ Builder.SetInsertPoint(ExitBB);
+ }
+ }
+ }
+
+ Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
+ return WcFunc;
+}
+
+/// This function emits a helper that copies all the reduction variables from
+/// the team into the provided global buffer for the reduction variables.
+///
+/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
+/// For all data entries D in reduce_data:
+/// Copy local D to buffer.D[Idx]
+///
+/// \p Loc is currently unused here. Only the first entry's ElementType is
+/// used to describe the global buffer layout (see FIXME below).
+static Function *emitListToGlobalCopyFunction(
+    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+    OpenMPIRBuilder &OMPBuilder) {
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  // The helper is emitted as a free-standing function; remember the caller's
+  // insertion point so it can be restored before returning.
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *Int32Ty = Builder.getInt32Ty();
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  // Signature: void(ptr buffer, i32 idx, ptr reduce_list).
+  auto FuncTy =
+      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+  Function *LtGCFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_list_to_global_copy_func", &M);
+  LtGCFunc->setDoesNotRecurse();
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGCFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Set arg names
+  Argument *Arg0 = LtGCFunc->getArg(0);
+  Argument *Arg1 = LtGCFunc->getArg(1);
+  Argument *Arg2 = LtGCFunc->getArg(2);
+  Arg0->setName("buffer_arg");
+  Arg1->setName("idx_arg");
+  Arg2->setName("reduce_list_arg");
+
+  // Spill the incoming arguments to allocas and address-space cast the alloca
+  // pointers before every access.
+  Value *BufferArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
+  Value *IdxArgAlloca =
+      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
+  Value *ReduceListArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
+  // FIXME(JAN): Assume a single globalized variable for now, this should be
+  // passed in
+  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
+  Type *TypeArgs[] = {SingleReductionTy};
+  StructType *ReductionsBufferTy =
+      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
+
+  Builder.CreateStore(Arg0, BufferArgAddrCast);
+  Builder.CreateStore(Arg1, IdxArgAddrCast);
+  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
+
+  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
+  Value *Idxs[] = {
+      Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast, "idxs")};
+  Value *ReduceListArg =
+      Builder.CreateLoad(PtrTy, ReduceListArgAddrCast, "reduce_list");
+  // FIXME(Jan): Assume TEK_SCALAR
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    // FIXME(Jan): Compute array type
+    auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+    // reduce_list[i] holds a pointer to the i-th local reduction element.
+    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedListArrayTy, ReduceListArg, 0, En.index());
+    Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+
+    // &buffer[idx].field<i> in the globalized-locals struct.
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index());
+    // local -> global copy.
+    Value *TargetElement = Builder.CreateLoad(RI.ElementType, TargetElementPtr);
+    Builder.CreateStore(TargetElement, GlobValPtr);
+  }
+
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGCFunc;
+}
+
+/// This function emits a helper that copies the reduction variables for a
+/// team out of the provided global buffer and back into the local reduction
+/// list, i.e. the inverse of the list_to_global copy helper.
+///
+/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
+/// For all data entries D in reduce_data:
+/// Copy buffer.D[Idx] to local D
+static Function *emitGlobalToListCopyFunction(
+    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+    OpenMPIRBuilder &OMPBuilder) {
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  // Save the caller's insertion point; the helper is a standalone function.
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *Int32Ty = Builder.getInt32Ty();
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  // Signature: void(ptr buffer, i32 idx, ptr reduce_list).
+  auto FuncTy =
+      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+  Function *LtGCFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_global_to_list_copy_func", &M);
+  LtGCFunc->setDoesNotRecurse();
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGCFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Set arg names
+  Argument *Arg0 = LtGCFunc->getArg(0);
+  Argument *Arg1 = LtGCFunc->getArg(1);
+  Argument *Arg2 = LtGCFunc->getArg(2);
+  Arg0->setName("buffer_arg");
+  Arg1->setName("idx_arg");
+  Arg2->setName("reduce_list_arg");
+
+  // Spill the incoming arguments to allocas and address-space cast the alloca
+  // pointers before every access.
+  Value *BufferArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
+  Value *IdxArgAlloca =
+      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
+  Value *ReduceListArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
+  // FIXME(JAN): Assume a single globalized variable for now, this should be
+  // passed in
+  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
+  Type *TypeArgs[] = {SingleReductionTy};
+  StructType *ReductionsBufferTy =
+      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
+
+  Builder.CreateStore(Arg0, BufferArgAddrCast);
+  Builder.CreateStore(Arg1, IdxArgAddrCast);
+  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
+
+  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
+  Value *Idxs[] = {
+      Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast, "idxs")};
+  Value *ReduceListArg =
+      Builder.CreateLoad(PtrTy, ReduceListArgAddrCast, "reduce_list");
+  // FIXME(Jan): Assume TEK_SCALAR
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    // FIXME(Jan): Compute array type
+    auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+    // reduce_list[i] holds a pointer to the i-th local reduction element.
+    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedListArrayTy, ReduceListArg, 0, En.index());
+    Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+
+    // &buffer[idx].field<i> in the globalized-locals struct.
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index());
+    // global -> local copy (direction is the reverse of the function above).
+    Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
+    Builder.CreateStore(TargetElement, TargetElementPtr);
+  }
+
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGCFunc;
+}
+
+/// This function emits a helper that reduces all the reduction variables from
+/// the team into the provided global buffer for the reduction variables.
+///
+/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
+/// void *GlobPtrs[];
+/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
+/// ...
+/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
+/// reduce_function(GlobPtrs, reduce_data);
+/// Create a function with a unique name and a "void (i8*, i8*)" signature in
+/// the given module and return it.
+static Function *emitListToGlobalReduceFunction(
+    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
+    OpenMPIRBuilder &OMPBuilder) {
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  // Save the caller's insertion point; the helper is a standalone function.
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *Int32Ty = Builder.getInt32Ty();
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  auto FuncTy =
+      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+  Function *LtGRFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_list_to_global_reduce_func", &M);
+  LtGRFunc->setDoesNotRecurse();
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGRFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Set arg names
+  Argument *Arg0 = LtGRFunc->getArg(0);
+  Argument *Arg1 = LtGRFunc->getArg(1);
+  Argument *Arg2 = LtGRFunc->getArg(2);
+  Arg0->setName("buffer_arg");
+  Arg1->setName("idx_arg");
+  Arg2->setName("reduce_list_arg");
+
+  Value *BufferArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
+  Value *IdxArgAlloca =
+      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
+  Value *ReduceListArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+  // FIXME(Jan): Compute array type
+  // Scratch list that will hold the GlobPtrs described in the header comment.
+  auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+  Value *LocalReduceList =
+      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
+
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
+  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LocalReduceList, PtrTy, LocalReduceList->getName() + ".acast");
+  // FIXME(JAN): Assume a single globalized variable for now, this should be
+  // passed in
+  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
+  Type *TypeArgs[] = {SingleReductionTy};
+  StructType *ReductionsBufferTy =
+      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
+
+  Builder.CreateStore(Arg0, BufferArgAddrCast);
+  Builder.CreateStore(Arg1, IdxArgAddrCast);
+  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
+
+  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
+  Value *Idxs[] = {Builder.CreateLoad(Int32Ty, IdxArgAddrCast, "idxs")};
+  // FIXME(Jan): Assume TEK_SCALAR
+  // Populate the scratch list with &buffer[idx].field<i> pointers.
+  // NOTE(review): RI is declared but unused in this loop; only En.index() is
+  // needed to pick the struct field.
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedListArrayTy, LocalReduceListAddrCast, 0, En.index());
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index());
+    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
+  }
+
+  // reduce_function(GlobPtrs, reduce_data): accumulate the incoming list into
+  // the global buffer (the LHS of the reduction is the global side).
+  Value *ReduceList = Builder.CreateLoad(PtrTy, ReduceListArgAddrCast);
+  Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList});
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGRFunc;
+}
+
+/// This function emits a helper that reduces the reduction variables in the
+/// provided global buffer into the team-local reduction list (the inverse of
+/// the list_to_global reduce helper).
+///
+/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
+/// void *GlobPtrs[];
+/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
+/// ...
+/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
+/// reduce_function(reduce_data, GlobPtrs);
 /// Create a function with a unique name and a "void (i8*, i8*)" signature in
 /// the given module and return it.
-Function *getFreshReductionFunc(Module &M) {
+static Function *emitGlobalToListReduceFunction(
+    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
+    OpenMPIRBuilder &OMPBuilder) {
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  // Save the caller's insertion point; the helper is a standalone function.
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *Int32Ty = Builder.getInt32Ty();
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  auto FuncTy =
+      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+  Function *LtGRFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_global_to_list_reduce_func", &M);
+  LtGRFunc->setDoesNotRecurse();
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGRFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Set arg names
+  Argument *Arg0 = LtGRFunc->getArg(0);
+  Argument *Arg1 = LtGRFunc->getArg(1);
+  Argument *Arg2 = LtGRFunc->getArg(2);
+  Arg0->setName("buffer_arg");
+  Arg1->setName("idx_arg");
+  Arg2->setName("reduce_list_arg");
+
+  Value *BufferArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
+  Value *IdxArgAlloca =
+      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
+  Value *ReduceListArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+  // FIXME(Jan): Compute array type
+  // Scratch list that will hold the GlobPtrs described in the header comment.
+  auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+  Value *LocalReduceList =
+      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
+
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
+  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LocalReduceList, PtrTy, LocalReduceList->getName() + ".acast");
+  // FIXME(JAN): Assume a single globalized variable for now, this should be
+  // passed in
+  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
+  Type *TypeArgs[] = {SingleReductionTy};
+  StructType *ReductionsBufferTy =
+      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
+
+  Builder.CreateStore(Arg0, BufferArgAddrCast);
+  Builder.CreateStore(Arg1, IdxArgAddrCast);
+  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
+
+  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
+  Value *Idxs[] = {Builder.CreateLoad(Int32Ty, IdxArgAddrCast, "idxs")};
+  // FIXME(Jan): Assume TEK_SCALAR
+  // Populate the scratch list with &buffer[idx].field<i> pointers.
+  // NOTE(review): RI is declared but unused in this loop; only En.index() is
+  // needed to pick the struct field.
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedListArrayTy, LocalReduceListAddrCast, 0, En.index());
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index());
+    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
+  }
+
+  // reduce_function(reduce_data, GlobPtrs): note the argument order is swapped
+  // relative to the list_to_global variant — here the incoming list is the LHS
+  // and accumulates the global buffer values.
+  Value *ReduceList = Builder.CreateLoad(PtrTy, ReduceListArgAddrCast);
+  Builder.CreateCall(ReduceFn, {ReduceList, LocalReduceListAddrCast});
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGRFunc;
+}
+
+/// Create a function with a fresh internal-linkage name and a
+/// "void (ptr, ptr)" signature in the given module and return it. The body is
+/// filled in later (see populateReductionFunction).
+static Function *getFreshReductionFunc(Module &M) {
   Type *VoidTy = Type::getVoidTy(M.getContext());
   Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
   auto *FuncTy =
       FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
   return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
-                          M.getDataLayout().getDefaultGlobalsAddressSpace(),
                           ".omp.reduction.func", &M);
 }
-OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
- const LocationDescription &Loc, InsertPointTy AllocaIP,
- ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef) {
- for (const ReductionInfo &RI : ReductionInfos) {
+/// Fill in the body of \p ReductionFunc: for every entry i of
+/// \p ReductionInfos, perform LHS[i] = ReductionGen(LHS[i], RHS[i]) where LHS
+/// and RHS are the two pointer-array arguments of the function.
+///
+/// When \p IsGPU is set, the incoming pointer arguments are first spilled to
+/// allocas (with address-space casts) before being dereferenced; on the host
+/// path they are used directly.
+static void populateReductionFunction(
+    Function *ReductionFunc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+    IRBuilder<> &Builder, bool IsGPU) {
+  Module *Module = ReductionFunc->getParent();
+  BasicBlock *ReductionFuncBlock =
+      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
+  Builder.SetInsertPoint(ReductionFuncBlock);
+  Value *LHSArrayPtr = nullptr;
+  Value *RHSArrayPtr = nullptr;
+  if (IsGPU) {
+    // Need to alloca memory here and deal with the pointers before getting
+    // LHS/RHS pointers out
+    //
+    Argument *Arg0 = ReductionFunc->getArg(0);
+    Argument *Arg1 = ReductionFunc->getArg(1);
+    Type *Arg0Type = Arg0->getType();
+    Type *Arg1Type = Arg1->getType();
+
+    Value *LHSAlloca =
+        Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+    Value *RHSAlloca =
+        Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+    Value *LHSAddrCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
+    Value *RHSAddrCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
+    Builder.CreateStore(Arg0, LHSAddrCast);
+    Builder.CreateStore(Arg1, RHSAddrCast);
+    LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
+    RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
+  } else {
+    LHSArrayPtr = ReductionFunc->getArg(0);
+    RHSArrayPtr = ReductionFunc->getArg(1);
+  }
+
+  unsigned NumReductions = ReductionInfos.size();
+  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
+
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    // Load the i-th element pointer out of each array, cast it to the
+    // variable's pointer type, and load the scalar value to reduce.
+    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedArrayTy, LHSArrayPtr, 0, En.index());
+    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        LHSI8Ptr, RI.Variable->getType());
+    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedArrayTy, RHSArrayPtr, 0, En.index());
+    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
+    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        RHSI8Ptr, RI.PrivateVariable->getType());
+    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+    Value *Reduced;
+    Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
+    // NOTE(review): if ReductionGen cleared the insertion block we bail out
+    // here, leaving the function without a RetVoid terminator — confirm that
+    // callers expect to finish the function in that case.
+    if (!Builder.GetInsertBlock())
+      return;
+    Builder.CreateStore(Reduced, LHSPtr);
+  }
+  Builder.CreateRetVoid();
+}
+
+/// Assert the basic invariants of the ReductionInfo descriptors before
+/// lowering. On the GPU the type-equality check between the original and
+/// private variables is skipped because they may live in different address
+/// spaces. No-op in NDEBUG builds.
+static void
+checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+                    bool IsGPU) {
+  for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
     (void)RI;
     assert(RI.Variable && "expected non-null variable");
     assert(RI.PrivateVariable && "expected non-null private variable");
     assert(RI.ReductionGen && "expected non-null reduction generator callback");
-    assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
-           "expected variables and their private equivalents to have the same "
-           "type");
+    // JAN: Skip this assertion for GPU, address spaces are present
+    if (!IsGPU) {
+      assert(
+          RI.Variable->getType() == RI.PrivateVariable->getType() &&
+          "expected variables and their private equivalents to have the same "
+          "type");
+    }
     assert(RI.Variable->getType()->isPointerTy() &&
            "expected variables to be pointers");
   }
+}
+
+/// GPU lowering of the OpenMP 'reduction' clause: emits the shuffle-and-reduce
+/// and inter-warp-copy helpers (plus the global-buffer helpers for teams
+/// reductions) and the runtime call that combines the per-thread partials.
+///
+/// \param Loc              Reduction location; also seeds the ident/RTLoc.
+/// \param AllocaIP         Insertion point for the reduction-list alloca.
+/// \param ReductionInfos   Reduction-variable descriptors; currently exactly
+///                         one entry is supported (see assert below).
+/// \param IsNoWait         Currently unused on this path.
+/// \param IsTeamsReduction True for a cross-team (teams) reduction.
+/// \param HasDistribute    True when the reduction sits under 'distribute'.
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
+    const LocationDescription &Loc, InsertPointTy AllocaIP,
+    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait,
+    bool IsTeamsReduction, bool HasDistribute) {
+  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
+  LLVMContext &Ctx = M.getContext();
+  if (!updateToLocation(Loc))
+    return InsertPointTy();
+
+  if (ReductionInfos.size() == 0)
+    return Builder.saveIP();
+
+  assert(ReductionInfos.size() == 1 && "More than one reduction variable");
+
+  // Copied code from createReductions
+  BasicBlock *InsertBlock = Loc.IP.getBlock();
+  BasicBlock *ContinuationBlock =
+      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
+  InsertBlock->getTerminator()->eraseFromParent();
+  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
+
+  // Create (or reuse the cached) element-wise reduction function.
+  Function *ReductionFunc = nullptr;
+  if (GLOBAL_ReductionFunc) {
+    ReductionFunc = GLOBAL_ReductionFunc;
+  } else {
+    ReductionFunc = getFreshReductionFunc(M);
+    GLOBAL_ReductionFunc = ReductionFunc;
+    InsertPointTy CurIP = Builder.saveIP();
+    populateReductionFunction(ReductionFunc, ReductionInfos, Builder, true);
+    Builder.restoreIP(CurIP);
+  }
+
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
+  Value *RTLoc =
+      getOrCreateIdent(SrcLocStr, SrcLocStrSize, llvm::omp::IdentFlag(0), 0);
+
+  // 1. Build a list of reduction variables
+  auto Size = ReductionInfos.size();
+  // FIXME(JAN): skipping variably modified type storage for array size
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  Type *RedArrayTy = ArrayType::get(PtrTy, Size);
+  InsertPointTy CurIP = Builder.saveIP();
+  Builder.restoreIP(AllocaIP);
+  Value *ReductionListAlloca =
+      Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
+  Value *ReductionList =
+      Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionListAlloca, PtrTy);
+  Builder.restoreIP(CurIP);
+  // Store a pointer to each private copy into the reduction list.
+  for (auto En : enumerate(ReductionInfos)) {
+    const ReductionInfo &RI = En.value();
+    Value *ElemPtr = Builder.CreateConstGEP2_64(RedArrayTy, ReductionList, 0,
+                                                En.index(), "elem_ptr");
+    Value *CastElem =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
+    Builder.CreateStore(CastElem, ElemPtr);
+  }
+  CurIP = Builder.saveIP();
+  Function *SarFunc = emitShuffleAndReduceFunction(M, Loc, ReductionInfos,
+                                                   ReductionFunc, *this);
+  Function *WcFunc = emitInterWarpCopyFunction(M, Loc, ReductionInfos, *this);
+  Builder.restoreIP(CurIP);
+
+  Value *RL =
+      Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
+  Value *ReductionDataSize =
+      getTypeSizeInBytesValue(Builder, M, ReductionInfos.begin()->ElementType);
+
+  Value *Res;
+  if (!IsTeamsReduction) {
+    Value *SarFuncCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
+    Value *WcFuncCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
+    Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
+    Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
+        RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
+    Res = Builder.CreateCall(Pv2Ptr, Args);
+  } else {
+    // Teams reduction additionally needs the four global-buffer helpers.
+    CurIP = Builder.saveIP();
+    Function *LtGCFunc =
+        emitListToGlobalCopyFunction(M, Loc, ReductionInfos, *this);
+    Function *LtGRFunc = emitListToGlobalReduceFunction(M, Loc, ReductionInfos,
+                                                        ReductionFunc, *this);
+    Function *GtLCFunc =
+        emitGlobalToListCopyFunction(M, Loc, ReductionInfos, *this);
+    Function *GtLRFunc = emitGlobalToListReduceFunction(M, Loc, ReductionInfos,
+                                                        ReductionFunc, *this);
+    Builder.restoreIP(CurIP);
+
+    Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
+        RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
+
+    Value *KernelTeamsReductionPtr = Builder.CreateCall(RedFixedBuferFn, {});
+
+    Value *Args3[] = {RTLoc,
+                      KernelTeamsReductionPtr,
+                      Builder.getInt32(1024),
+                      ReductionDataSize,
+                      RL,
+                      SarFunc,
+                      WcFunc,
+                      LtGCFunc,
+                      LtGRFunc,
+                      GtLCFunc,
+                      GtLRFunc};
+
+    Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
+        RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
+    Res = Builder.CreateCall(TeamsReduceFn, Args3);
+  }
+
+  // If the runtime reported that this thread holds the final partial result
+  // (Res == 1), fold the private copy into the original variable.
+  if (IsTeamsReduction || !HasDistribute) {
+    Function *CurFunc = Builder.GetInsertBlock()->getParent();
+    BasicBlock *ExitBB =
+        BasicBlock::Create(Ctx, ".omp.reduction.done", CurFunc);
+    BasicBlock *ThenBB =
+        BasicBlock::Create(Ctx, ".omp.reduction.then", CurFunc);
+    Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
+    Builder.CreateCondBr(Cond, ThenBB, ExitBB);
+
+    Builder.SetInsertPoint(ThenBB);
+    for (auto En : enumerate(ReductionInfos)) {
+      const ReductionInfo &RI = En.value();
+      Value *InputVal = Builder.CreateLoad(RI.ElementType, RI.Variable);
+      Value *RedVal = Builder.CreateLoad(
+          RI.ElementType, Builder.CreatePointerBitCastOrAddrSpaceCast(
+                              RI.PrivateVariable, PtrTy));
+      Value *Reduced;
+      Builder.restoreIP(
+          RI.ReductionGen(Builder.saveIP(), InputVal, RedVal, Reduced));
+      Builder.CreateStore(Reduced, RI.Variable);
+    }
+    // Terminate the then-block exactly once, after all reduction variables
+    // have been combined. (Previously the branch was created inside the loop,
+    // which would have emitted a second terminator — and dead stores after
+    // it — for more than one reduction variable.)
+    Builder.CreateBr(ExitBB);
+    Builder.SetInsertPoint(ExitBB);
+  }
+  Builder.CreateBr(ContinuationBlock);
+  Builder.SetInsertPoint(ContinuationBlock);
+  return Builder.saveIP();
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef,
+ bool IsTeamsReduction, bool HasDistribute) {
+ if (Config.isGPU())
+ return createReductionsGPU(Loc, AllocaIP, ReductionInfos, IsNoWait,
+ IsTeamsReduction, HasDistribute);
+
+ checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
if (!updateToLocation(Loc))
return InsertPointTy();
+ if (ReductionInfos.size() == 0)
+ return Builder.saveIP();
+
BasicBlock *InsertBlock = Loc.IP.getBlock();
BasicBlock *ContinuationBlock =
InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
@@ -2275,6 +3441,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
}
Builder.CreateRetVoid();
+ populateReductionFunction(ReductionFunc, ReductionInfos, Builder, false);
Builder.SetInsertPoint(ContinuationBlock);
return Builder.saveIP();
}
@@ -2787,10 +3954,24 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
static void createTargetLoopWorkshareCall(
OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
- Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
- Type *TripCountTy = TripCount->getType();
+ Type *ParallelTaskPtr, Value *TripCountOrig, Function &LoopBodyFn) {
Module &M = OMPBuilder->M;
IRBuilder<> &Builder = OMPBuilder->Builder;
+ Value *TripCount = TripCountOrig;
+ // FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
+ // not be the right way to fix it, but this works for now.
+ if (OMPBuilder->Config.isGPU()) {
+ if (LoopType != WorksharingLoopType::DistributeStaticLoop)
+ Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
+ LLVMContext &Ctx = M.getContext();
+ Type *IVTy = TripCountOrig->getType();
+ Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32
+ ? Type::getInt32Ty(Ctx)
+ : Type::getInt64Ty(Ctx);
+ Constant *One = ConstantInt::get(InternalIVTy, 1);
+ TripCount = Builder.CreateSub(TripCountOrig, One, "modified_trip_count");
+ }
+ Type *TripCountTy = TripCount->getType();
FunctionCallee RTLFn =
getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
SmallVector<Value *, 8> RealArgs;
@@ -4551,6 +5732,9 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
Function *Kernel = Builder.GetInsertBlock()->getParent();
+ // Set the grid value in the config needed for lowering later on
+ Config.setGridValue(getGridValue(T, Config.TargetFeatures));
+
// Manifest the launch configuration in the metadata matching the kernel
// environment.
if (MinTeamsVal > 1 || MaxTeamsVal > 0)
@@ -4803,6 +5987,10 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
if (T.isAMDGCN())
OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
+ if (!Config.TargetCPU.empty())
+ OutlinedFn->addFnAttr("target-cpu", Config.TargetCPU);
+ if (!Config.TargetFeatures.empty())
+ OutlinedFn->addFnAttr("target-features", Config.TargetFeatures);
}
}
>From 9149fc22ee8163a7b158deca79084e13de622d7b Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 31 Jan 2024 20:05:04 +0000
Subject: [PATCH 02/18] Clang migrate to using OMPBuilder.
---
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 1375 +-----------
clang/lib/CodeGen/CGOpenMPRuntimeGPU.h | 3 -
...arallel_reduction_codegen_tbaa_PR46146.cpp | 546 ++---
.../OpenMP/nvptx_teams_reduction_codegen.cpp | 24 +-
.../target_teams_generic_loop_codegen.cpp | 12 +-
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 623 +++++-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 1880 +++++++++--------
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 11 +-
8 files changed, 1959 insertions(+), 2515 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 5baac8f0e3e268..93b46da1d55458 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -501,31 +501,6 @@ class CheckVarsEscapingDeclContext final
};
} // anonymous namespace
-/// Get the id of the warp in the block.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
-}
-
-/// Get the id of the current lane in the Warp.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
- unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
- "nvptx_lane_id");
-}
-
CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
return CurrentExecutionMode;
@@ -1430,1133 +1405,6 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
TBAAAccessInfo());
}
-/// This function creates calls to one of two shuffle functions to copy
-/// variables between lanes in a warp.
-static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
- llvm::Value *Elem,
- QualType ElemType,
- llvm::Value *Offset,
- SourceLocation Loc) {
- CodeGenModule &CGM = CGF.CGM;
- CGBuilderTy &Bld = CGF.Builder;
- CGOpenMPRuntimeGPU &RT =
- *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
- llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
-
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- assert(Size.getQuantity() <= 8 &&
- "Unsupported bitwidth in shuffle instruction.");
-
- RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
- ? OMPRTL___kmpc_shuffle_int32
- : OMPRTL___kmpc_shuffle_int64;
-
- // Cast all types to 32- or 64-bit values before calling shuffle routines.
- QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
- Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
- llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
- llvm::Value *WarpSize =
- Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
-
- llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
- {ElemCast, Offset, WarpSize});
-
- return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
-}
-
-static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
- Address DestAddr, QualType ElemType,
- llvm::Value *Offset, SourceLocation Loc) {
- CGBuilderTy &Bld = CGF.Builder;
-
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- // Create the loop over the big sized data.
- // ptr = (void*)Elem;
- // ptrEnd = (void*) Elem + 1;
- // Step = 8;
- // while (ptr + Step < ptrEnd)
- // shuffle((int64_t)*ptr);
- // Step = 4;
- // while (ptr + Step < ptrEnd)
- // shuffle((int32_t)*ptr);
- // ...
- Address ElemPtr = DestAddr;
- Address Ptr = SrcAddr;
- Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
- Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
- for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
- if (Size < CharUnits::fromQuantity(IntSize))
- continue;
- QualType IntType = CGF.getContext().getIntTypeForBitwidth(
- CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
- /*Signed=*/1);
- llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
- Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
- IntTy);
- ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtr, IntTy->getPointerTo(), IntTy);
- if (Size.getQuantity() / IntSize > 1) {
- llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
- llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
- llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
- CGF.EmitBlock(PreCondBB);
- llvm::PHINode *PhiSrc =
- Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
- PhiSrc->addIncoming(Ptr.emitRawPointer(CGF), CurrentBB);
- llvm::PHINode *PhiDest =
- Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
- PhiDest->addIncoming(ElemPtr.emitRawPointer(CGF), CurrentBB);
- Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
- ElemPtr =
- Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
- llvm::Value *PtrEndRaw = PtrEnd.emitRawPointer(CGF);
- llvm::Value *PtrRaw = Ptr.emitRawPointer(CGF);
- llvm::Value *PtrDiff = Bld.CreatePtrDiff(
- CGF.Int8Ty, PtrEndRaw,
- Bld.CreatePointerBitCastOrAddrSpaceCast(PtrRaw, CGF.VoidPtrTy));
- Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
- ThenBB, ExitBB);
- CGF.EmitBlock(ThenBB);
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
- Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- PhiSrc->addIncoming(LocalPtr.emitRawPointer(CGF), ThenBB);
- PhiDest->addIncoming(LocalElemPtr.emitRawPointer(CGF), ThenBB);
- CGF.EmitBranch(PreCondBB);
- CGF.EmitBlock(ExitBB);
- } else {
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Ptr = Bld.CreateConstGEP(Ptr, 1);
- ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- }
- Size = Size % IntSize;
- }
-}
-
-namespace {
-enum CopyAction : unsigned {
- // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
- // the warp using shuffle instructions.
- RemoteLaneToThread,
- // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
- ThreadCopy,
-};
-} // namespace
-
-struct CopyOptionsTy {
- llvm::Value *RemoteLaneOffset;
- llvm::Value *ScratchpadIndex;
- llvm::Value *ScratchpadWidth;
-};
-
-/// Emit instructions to copy a Reduce list, which contains partially
-/// aggregated values, in the specified direction.
-static void emitReductionListCopy(
- CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
- ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
- CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
-
- CodeGenModule &CGM = CGF.CGM;
- ASTContext &C = CGM.getContext();
- CGBuilderTy &Bld = CGF.Builder;
-
- llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
-
- // Iterates, element-by-element, through the source Reduce list and
- // make a copy.
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- Address SrcElementAddr = Address::invalid();
- Address DestElementAddr = Address::invalid();
- Address DestElementPtrAddr = Address::invalid();
- // Should we shuffle in an element from a remote lane?
- bool ShuffleInElement = false;
- // Set to true to update the pointer in the dest Reduce list to a
- // newly created element.
- bool UpdateDestListPtr = false;
- QualType PrivatePtrType = C.getPointerType(Private->getType());
- llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
-
- switch (Action) {
- case RemoteLaneToThread: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
-
- // Step 1.2: Create a temporary to store the element in the destination
- // Reduce list.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr =
- CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
- ShuffleInElement = true;
- UpdateDestListPtr = true;
- break;
- }
- case ThreadCopy: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
-
- // Step 1.2: Get the address for dest element. The destination
- // element has already been created on the thread's stack.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr = CGF.EmitLoadOfPointer(
- DestElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
- break;
- }
- }
-
- // Regardless of src and dest of copy, we emit the load of src
- // element as this is required in all directions
- SrcElementAddr = SrcElementAddr.withElementType(
- CGF.ConvertTypeForMem(Private->getType()));
- DestElementAddr =
- DestElementAddr.withElementType(SrcElementAddr.getElementType());
-
- // Now that all active lanes have read the element in the
- // Reduce list, shuffle over the value from the remote lane.
- if (ShuffleInElement) {
- shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
- RemoteLaneOffset, Private->getExprLoc());
- } else {
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *Elem = CGF.EmitLoadOfScalar(
- SrcElementAddr, /*Volatile=*/false, Private->getType(),
- Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(
- Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getExprLoc());
- CGF.EmitStoreOfComplex(
- Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(
- CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- break;
- }
- }
-
- // Step 3.1: Modify reference in dest Reduce list as needed.
- // Modifying the reference in Reduce list to point to the newly
- // created element. The element is live in the current function
- // scope and that of functions it invokes (i.e., reduce_function).
- // RemoteReduceData[i] = (void*)&RemoteElem
- if (UpdateDestListPtr) {
- CGF.EmitStoreOfScalar(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- DestElementAddr.emitRawPointer(CGF), CGF.VoidPtrTy),
- DestElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy);
- }
-
- ++Idx;
- }
-}
-
-/// This function emits a helper that gathers Reduce lists from the first
-/// lane of every active warp to lanes in the first warp.
-///
-/// void inter_warp_copy_func(void* reduce_data, num_warps)
-/// shared smem[warp_size];
-/// For all data entries D in reduce_data:
-/// sync
-/// If (I am the first lane in each warp)
-/// Copy my local D to smem[warp_id]
-/// sync
-/// if (I am the first warp)
-/// Copy smem[thread_id] to my local D
-static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
- ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy,
- SourceLocation Loc) {
- ASTContext &C = CGM.getContext();
- llvm::Module &M = CGM.getModule();
-
- // ReduceList: thread local Reduce list.
- // At the stage of the computation when this function is called, partially
- // aggregated values reside in the first lane of every active warp.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // NumWarps: number of warps active in the parallel region. This could
- // be smaller than 32 (max warps in a CTA) for partial block reduction.
- ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.getIntTypeForBitwidth(32, /* Signed */ true),
- ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&ReduceListArg);
- Args.push_back(&NumWarpsArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
- llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_inter_warp_copy_func", &M);
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- // This array is used as a medium to transfer, one reduce element at a time,
- // the data from the first lane of every warp to lanes in the first warp
- // in order to perform the final step of a reduction in a parallel region
- // (reduction across warps). The array is placed in NVPTX __shared__ memory
- // for reduced latency, as well as to have a distinct copy for concurrently
- // executing target regions. The array is declared with common linkage so
- // as to be shared across compilation units.
- StringRef TransferMediumName =
- "__openmp_nvptx_data_transfer_temporary_storage";
- llvm::GlobalVariable *TransferMedium =
- M.getGlobalVariable(TransferMediumName);
- unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
- if (!TransferMedium) {
- auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
- unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
- TransferMedium = new llvm::GlobalVariable(
- M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
- llvm::UndefValue::get(Ty), TransferMediumName,
- /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
- SharedAddressSpace);
- CGM.addCompilerUsedGlobal(TransferMedium);
- }
-
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- // Get the CUDA thread id of the current OpenMP thread on the GPU.
- llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
- // nvptx_lane_id = nvptx_id % warpsize
- llvm::Value *LaneID = getNVPTXLaneID(CGF);
- // nvptx_warp_id = nvptx_id / warpsize
- llvm::Value *WarpID = getNVPTXWarpID(CGF);
-
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
- ElemTy->getPointerTo()),
- ElemTy, CGF.getPointerAlign());
-
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- //
- // Warp master copies reduce element to transfer medium in __shared__
- // memory.
- //
- unsigned RealTySize =
- C.getTypeSizeInChars(Private->getType())
- .alignTo(C.getTypeAlignInChars(Private->getType()))
- .getQuantity();
- for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
- unsigned NumIters = RealTySize / TySize;
- if (NumIters == 0)
- continue;
- QualType CType = C.getIntTypeForBitwidth(
- C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
- llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
- CharUnits Align = CharUnits::fromQuantity(TySize);
- llvm::Value *Cnt = nullptr;
- Address CntAddr = Address::invalid();
- llvm::BasicBlock *PrecondBB = nullptr;
- llvm::BasicBlock *ExitBB = nullptr;
- if (NumIters > 1) {
- CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
- CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
- /*Volatile=*/false, C.IntTy);
- PrecondBB = CGF.createBasicBlock("precond");
- ExitBB = CGF.createBasicBlock("exit");
- llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
- // There is no need to emit line number for unconditional branch.
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(PrecondBB);
- Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
- llvm::Value *Cmp =
- Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
- Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
- CGF.EmitBlock(BodyBB);
- }
- // kmpc_barrier.
- CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
- /*EmitChecks=*/false,
- /*ForceSimpleCall=*/true);
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
-
- // if (lane_id == 0)
- llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
- Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
- CGF.EmitBlock(ThenBB);
-
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- Address ElemPtr(ElemPtrPtr, CopyType, Align);
- if (NumIters > 1)
- ElemPtr = Bld.CreateGEP(CGF, ElemPtr, Cnt);
-
- // Get pointer to location in transfer medium.
- // MediumPtr = &medium[warp_id]
- llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium->getValueType(), TransferMedium,
- {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
- // Casting to actual data type.
- // MediumPtr = (CopyType*)MediumPtrAddr;
- Address MediumPtr(MediumPtrVal, CopyType, Align);
-
- // elem = *elemptr
- //*MediumPtr = elem
- llvm::Value *Elem = CGF.EmitLoadOfScalar(
- ElemPtr, /*Volatile=*/false, CType, Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
-
- Bld.CreateBr(MergeBB);
-
- CGF.EmitBlock(ElseBB);
- Bld.CreateBr(MergeBB);
-
- CGF.EmitBlock(MergeBB);
-
- // kmpc_barrier.
- CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
- /*EmitChecks=*/false,
- /*ForceSimpleCall=*/true);
-
- //
- // Warp 0 copies reduce element from transfer medium.
- //
- llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
-
- Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
- llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
- AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
-
- // Up to 32 threads in warp 0 are active.
- llvm::Value *IsActiveThread =
- Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
- Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
-
- CGF.EmitBlock(W0ThenBB);
-
- // SrcMediumPtr = &medium[tid]
- llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium->getValueType(), TransferMedium,
- {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
- // SrcMediumVal = *SrcMediumPtr;
- Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align);
-
- // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
- Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
- TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
- Address TargetElemPtr(TargetElemPtrVal, CopyType, Align);
- if (NumIters > 1)
- TargetElemPtr = Bld.CreateGEP(CGF, TargetElemPtr, Cnt);
-
- // *TargetElemPtr = SrcMediumVal;
- llvm::Value *SrcMediumValue =
- CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
- CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
- CType);
- Bld.CreateBr(W0MergeBB);
-
- CGF.EmitBlock(W0ElseBB);
- Bld.CreateBr(W0MergeBB);
-
- CGF.EmitBlock(W0MergeBB);
-
- if (NumIters > 1) {
- Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
- CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
- CGF.EmitBranch(PrecondBB);
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(ExitBB);
- }
- RealTySize %= TySize;
- }
- ++Idx;
- }
-
- CGF.FinishFunction();
- return Fn;
-}
-
-/// Emit a helper that reduces data across two OpenMP threads (lanes)
-/// in the same warp. It uses shuffle instructions to copy over data from
-/// a remote lane's stack. The reduction algorithm performed is specified
-/// by the fourth parameter.
-///
-/// Algorithm Versions.
-/// Full Warp Reduce (argument value 0):
-/// This algorithm assumes that all 32 lanes are active and gathers
-/// data from these 32 lanes, producing a single resultant value.
-/// Contiguous Partial Warp Reduce (argument value 1):
-/// This algorithm assumes that only a *contiguous* subset of lanes
-/// are active. This happens for the last warp in a parallel region
-/// when the user specified num_threads is not an integer multiple of
-/// 32. This contiguous subset always starts with the zeroth lane.
-/// Partial Warp Reduce (argument value 2):
-/// This algorithm gathers data from any number of lanes at any position.
-/// All reduced values are stored in the lowest possible lane. The set
-/// of problems every algorithm addresses is a super set of those
-/// addressable by algorithms with a lower version number. Overhead
-/// increases as algorithm version increases.
-///
-/// Terminology
-/// Reduce element:
-/// Reduce element refers to the individual data field with primitive
-/// data types to be combined and reduced across threads.
-/// Reduce list:
-/// Reduce list refers to a collection of local, thread-private
-/// reduce elements.
-/// Remote Reduce list:
-/// Remote Reduce list refers to a collection of remote (relative to
-/// the current thread) reduce elements.
-///
-/// We distinguish between three states of threads that are important to
-/// the implementation of this function.
-/// Alive threads:
-/// Threads in a warp executing the SIMT instruction, as distinguished from
-/// threads that are inactive due to divergent control flow.
-/// Active threads:
-/// The minimal set of threads that has to be alive upon entry to this
-/// function. The computation is correct iff active threads are alive.
-/// Some threads are alive but they are not active because they do not
-/// contribute to the computation in any useful manner. Turning them off
-/// may introduce control flow overheads without any tangible benefits.
-/// Effective threads:
-/// In order to comply with the argument requirements of the shuffle
-/// function, we must keep all lanes holding data alive. But at most
-/// half of them perform value aggregation; we refer to this half of
-/// threads as effective. The other half is simply handing off their
-/// data.
-///
-/// Procedure
-/// Value shuffle:
-/// In this step active threads transfer data from higher lane positions
-/// in the warp to lower lane positions, creating Remote Reduce list.
-/// Value aggregation:
-/// In this step, effective threads combine their thread local Reduce list
-/// with Remote Reduce list and store the result in the thread local
-/// Reduce list.
-/// Value copy:
-/// In this step, we deal with the assumption made by algorithm 2
-/// (i.e. contiguity assumption). When we have an odd number of lanes
-/// active, say 2k+1, only k threads will be effective and therefore k
-/// new values will be produced. However, the Reduce list owned by the
-/// (2k+1)th thread is ignored in the value aggregation. Therefore
-/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
-/// that the contiguity assumption still holds.
-static llvm::Function *emitShuffleAndReduceFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
- ASTContext &C = CGM.getContext();
-
- // Thread local Reduce list used to host the values of data to be reduced.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // Current lane id; could be logical.
- ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
- ImplicitParamKind::Other);
- // Offset of the remote source lane relative to the current lane.
- ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.ShortTy, ImplicitParamKind::Other);
- // Algorithm version. This is expected to be known at compile time.
- ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.ShortTy, ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&ReduceListArg);
- Args.push_back(&LaneIDArg);
- Args.push_back(&RemoteLaneOffsetArg);
- Args.push_back(&AlgoVerArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
-
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
- C.VoidPtrTy, SourceLocation()),
- ElemTy->getPointerTo()),
- ElemTy, CGF.getPointerAlign());
-
- Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
- llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
- AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
-
- Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
- llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
- AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
-
- Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
- llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
- AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
-
- // Create a local thread-private variable to host the Reduce list
- // from a remote lane.
- Address RemoteReduceList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
-
- // This loop iterates through the list of reduce elements and copies,
- // element by element, from a remote lane in the warp to RemoteReduceList,
- // hosted on the thread's stack.
- emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
- LocalReduceList, RemoteReduceList,
- {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
- /*ScratchpadIndex=*/nullptr,
- /*ScratchpadWidth=*/nullptr});
-
- // The actions to be performed on the Remote Reduce list is dependent
- // on the algorithm version.
- //
- // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
- // LaneId % 2 == 0 && Offset > 0):
- // do the reduction value aggregation
- //
- // The thread local variable Reduce list is mutated in place to host the
- // reduced data, which is the aggregated value produced from local and
- // remote lanes.
- //
- // Note that AlgoVer is expected to be a constant integer known at compile
- // time.
- // When AlgoVer==0, the first conjunction evaluates to true, making
- // the entire predicate true during compile time.
- // When AlgoVer==1, the second conjunction has only the second part to be
- // evaluated during runtime. Other conjunctions evaluates to false
- // during compile time.
- // When AlgoVer==2, the third conjunction has only the second part to be
- // evaluated during runtime. Other conjunctions evaluates to false
- // during compile time.
- llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
-
- llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
- llvm::Value *CondAlgo1 = Bld.CreateAnd(
- Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
-
- llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
- llvm::Value *CondAlgo2 = Bld.CreateAnd(
- Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
- CondAlgo2 = Bld.CreateAnd(
- CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
-
- llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
- CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
-
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
- Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
-
- CGF.EmitBlock(ThenBB);
- // reduce_function(LocalReduceList, RemoteReduceList)
- llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- LocalReduceList.emitRawPointer(CGF), CGF.VoidPtrTy);
- llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- RemoteReduceList.emitRawPointer(CGF), CGF.VoidPtrTy);
- CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
- CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
- Bld.CreateBr(MergeBB);
-
- CGF.EmitBlock(ElseBB);
- Bld.CreateBr(MergeBB);
-
- CGF.EmitBlock(MergeBB);
-
- // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
- // Reduce list.
- Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
- llvm::Value *CondCopy = Bld.CreateAnd(
- Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
-
- llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
- Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
-
- CGF.EmitBlock(CpyThenBB);
- emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
- RemoteReduceList, LocalReduceList);
- Bld.CreateBr(CpyMergeBB);
-
- CGF.EmitBlock(CpyElseBB);
- Bld.CreateBr(CpyMergeBB);
-
- CGF.EmitBlock(CpyMergeBB);
-
- CGF.FinishFunction();
- return Fn;
-}
-
-/// This function emits a helper that copies all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
-/// For all data entries D in reduce_data:
-/// Copy local D to buffer.D[Idx]
-static llvm::Value *emitListToGlobalCopyFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap) {
- ASTContext &C = CGM.getContext();
-
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamKind::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
- C.VoidPtrTy, Loc),
- ElemTy->getPointerTo()),
- ElemTy, CGF.getPointerAlign());
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
- llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- ElemTy = CGF.ConvertTypeForMem(Private->getType());
- ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtrPtr, ElemTy->getPointerTo());
- Address ElemPtr =
- Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
- const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
- // Global = Buffer.VD[Idx];
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- llvm::Value *BufferPtr =
- Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),
- CGF.ConvertTypeForMem(Private->getType()),
- GlobAddr.getAlignment()));
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *V = CGF.EmitLoadOfScalar(
- ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- CGF.EmitStoreOfScalar(V, GlobLVal);
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
- CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
- CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(GlobLVal,
- CGF.MakeAddrLValue(ElemPtr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- break;
- }
- ++Idx;
- }
-
- CGF.FinishFunction();
- return Fn;
-}
-
-/// This function emits a helper that reduces all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
-/// void *GlobPtrs[];
-/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
-/// ...
-/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
-/// reduce_function(GlobPtrs, reduce_data);
-static llvm::Value *emitListToGlobalReduceFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap,
- llvm::Function *ReduceFn) {
- ASTContext &C = CGM.getContext();
-
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamKind::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
-
- // 1. Build a list of reduction variables.
- // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
- RawAddress ReductionList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
- auto IPriv = Privates.begin();
- llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
- Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- // Global = Buffer.VD[Idx];
- const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- llvm::Value *BufferPtr =
- Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,
- /*Volatile=*/false, C.VoidPtrTy);
- if ((*IPriv)->getType()->isVariablyModifiedType()) {
- // Store array size.
- ++Idx;
- Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- llvm::Value *Size = CGF.Builder.CreateIntCast(
- CGF.getVLASize(
- CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
- .NumElts,
- CGF.SizeTy, /*isSigned=*/false);
- CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
- Elem);
- }
- }
-
- // Call reduce_function(GlobalReduceList, ReduceList)
- llvm::Value *GlobalReduceList = ReductionList.getPointer();
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
- CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
- CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
- CGF.FinishFunction();
- return Fn;
-}
-
-/// This function emits a helper that copies all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
-/// For all data entries D in reduce_data:
-/// Copy buffer.D[Idx] to local D;
-static llvm::Value *emitGlobalToListCopyFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap) {
- ASTContext &C = CGM.getContext();
-
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamKind::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
- C.VoidPtrTy, Loc),
- ElemTy->getPointerTo()),
- ElemTy, CGF.getPointerAlign());
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
-
- llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- ElemTy = CGF.ConvertTypeForMem(Private->getType());
- ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtrPtr, ElemTy->getPointerTo());
- Address ElemPtr =
- Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
- const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
- // Global = Buffer.VD[Idx];
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- llvm::Value *BufferPtr =
- Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),
- CGF.ConvertTypeForMem(Private->getType()),
- GlobAddr.getAlignment()));
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
- CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
- CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
- /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
- GlobLVal, Private->getType(),
- AggValueSlot::DoesNotOverlap);
- break;
- }
- ++Idx;
- }
-
- CGF.FinishFunction();
- return Fn;
-}
-
-/// This function emits a helper that reduces all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
-/// void *GlobPtrs[];
-/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
-/// ...
-/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
-/// reduce_function(reduce_data, GlobPtrs);
-static llvm::Value *emitGlobalToListReduceFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap,
- llvm::Function *ReduceFn) {
- ASTContext &C = CGM.getContext();
-
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamKind::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
-
- // 1. Build a list of reduction variables.
- // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
- Address ReductionList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
- auto IPriv = Privates.begin();
- llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
- Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- // Global = Buffer.VD[Idx];
- const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- llvm::Value *BufferPtr =
- Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,
- /*Volatile=*/false, C.VoidPtrTy);
- if ((*IPriv)->getType()->isVariablyModifiedType()) {
- // Store array size.
- ++Idx;
- Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- llvm::Value *Size = CGF.Builder.CreateIntCast(
- CGF.getVLASize(
- CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
- .NumElts,
- CGF.SizeTy, /*isSigned=*/false);
- CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
- Elem);
- }
- }
-
- // Call reduce_function(ReduceList, GlobalReduceList)
- llvm::Value *GlobalReduceList = ReductionList.emitRawPointer(CGF);
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
- CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
- CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
- CGF.FinishFunction();
- return Fn;
-}
-
///
/// Design of OpenMP reductions on the GPU
///
@@ -2807,10 +1655,13 @@ void CGOpenMPRuntimeGPU::emitReduction(
return;
bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
+ bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
#ifndef NDEBUG
bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
#endif
+ ASTContext &C = CGM.getContext();
+
if (Options.SimpleReduction) {
assert(!TeamsReduction && !ParallelReduction &&
"Invalid reduction selection in emitReduction.");
@@ -2819,155 +1670,78 @@ void CGOpenMPRuntimeGPU::emitReduction(
return;
}
- assert((TeamsReduction || ParallelReduction) &&
- "Invalid reduction selection in emitReduction.");
-
- llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
- llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
- int Cnt = 0;
- for (const Expr *DRE : Privates) {
- PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
- ++Cnt;
- }
-
- ASTContext &C = CGM.getContext();
- const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
- CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
-
- // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
- // RedList, shuffle_reduce_func, interwarp_copy_func);
- // or
- // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
+ // Source location for the ident struct
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
- llvm::Value *Res;
- // 1. Build a list of reduction variables.
- // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
- auto Size = RHSExprs.size();
- for (const Expr *E : Privates) {
- if (E->getType()->isVariablyModifiedType())
- // Reserve place for array size.
- ++Size;
- }
- llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
- QualType ReductionArrayTy = C.getConstantArrayType(
- C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,
- /*IndexTypeQuals=*/0);
- Address ReductionList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
- auto IPriv = Privates.begin();
- unsigned Idx = 0;
- for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
- Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- CGF.Builder.CreateStore(
- CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
- Elem);
- if ((*IPriv)->getType()->isVariablyModifiedType()) {
- // Store array size.
- ++Idx;
- Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- llvm::Value *Size = CGF.Builder.CreateIntCast(
- CGF.getVLASize(
- CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
- .NumElts,
- CGF.SizeTy, /*isSigned=*/false);
- CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
- Elem);
- }
- }
-
- llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- ReductionList.emitRawPointer(CGF), CGF.VoidPtrTy);
- llvm::Function *ReductionFn = emitReductionFunction(
- CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
- Privates, LHSExprs, RHSExprs, ReductionOps);
- llvm::Value *ReductionDataSize =
- CGF.getTypeSize(C.getRecordType(ReductionRec));
- ReductionDataSize =
- CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
- llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
- CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
- llvm::Value *InterWarpCopyFn =
- emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
-
- if (ParallelReduction) {
- llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
- InterWarpCopyFn};
-
- Res = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
- Args);
- } else {
- assert(TeamsReduction && "expected teams reduction.");
- TeamsReductions.push_back(ReductionRec);
- auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
- {}, "_openmp_teams_reductions_buffer_$_$ptr");
- llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
- llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
- ReductionFn);
- llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
- llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
- ReductionFn);
-
- llvm::Value *Args[] = {
- RTLoc,
- KernelTeamsReductionPtr,
- CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
- ReductionDataSize,
- RL,
- ShuffleAndReduceFn,
- InterWarpCopyFn,
- GlobalToBufferCpyFn,
- GlobalToBufferRedFn,
- BufferToGlobalCpyFn,
- BufferToGlobalRedFn};
-
- Res = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
- Args);
- }
+ using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+ InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
+ CGF.AllocaInsertPt->getIterator());
+ InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
+ CGF.Builder.GetInsertPoint());
+ llvm::OpenMPIRBuilder::LocationDescription OmpLoc(CodeGenIP);
+ llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
- // 5. Build if (res == 1)
- llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
- llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
- Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
- CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
-
- // 6. Build then branch: where we have reduced values in the master
- // thread in each team.
- // __kmpc_end_reduce{_nowait}(<gtid>);
- // break;
- CGF.EmitBlock(ThenBB);
-
- // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
- auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
- this](CodeGenFunction &CGF, PrePostActionTy &Action) {
- auto IPriv = Privates.begin();
- auto ILHS = LHSExprs.begin();
- auto IRHS = RHSExprs.begin();
- for (const Expr *E : ReductionOps) {
- emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
- cast<DeclRefExpr>(*IRHS));
- ++IPriv;
- ++ILHS;
- ++IRHS;
+ CodeGenFunction::OMPPrivateScope Scope(CGF);
+ unsigned Idx = 0;
+ for (const Expr *Private : Privates) {
+ llvm::Type *ElementType;
+ llvm::Value *Variable;
+ llvm::Value *PrivateVariable;
+ llvm::OpenMPIRBuilder::AtomicReductionGenCB AtomicReductionGen = nullptr;
+ ElementType = CGF.ConvertTypeForMem(Private->getType());
+ const auto *RHSVar =
+ cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
+ PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).getPointer();
+ const auto *LHSVar =
+ cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
+ Variable = CGF.GetAddrOfLocalVar(LHSVar).getPointer();
+ llvm::OpenMPIRBuilder::EvaluationKindTy EvalKind;
+ switch (CGF.getEvaluationKind(Private->getType())) {
+ case TEK_Scalar:
+ EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar;
+ break;
+ case TEK_Complex:
+ EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Complex;
+ break;
+ case TEK_Aggregate:
+ EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Aggregate;
+ break;
}
- };
- RegionCodeGenTy RCG(CodeGen);
- RCG(CGF);
- // There is no need to emit line number for unconditional branch.
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
+ auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
+ llvm::Value **LHSPtr, llvm::Value **RHSPtr,
+ llvm::Function *NewFunc) {
+ CGF.Builder.restoreIP(CodeGenIP);
+ auto *CurFn = CGF.CurFn;
+ CGF.CurFn = NewFunc;
+
+ *LHSPtr = CGF.GetAddrOfLocalVar(
+ cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
+ .getPointer();
+ *RHSPtr = CGF.GetAddrOfLocalVar(
+ cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
+ .getPointer();
+
+ emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
+ cast<DeclRefExpr>(LHSExprs[I]),
+ cast<DeclRefExpr>(RHSExprs[I]));
+
+ CGF.CurFn = CurFn;
+
+ return InsertPointTy(CGF.Builder.GetInsertBlock(),
+ CGF.Builder.GetInsertPoint());
+ };
+ ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
+ ElementType, Variable, PrivateVariable, EvalKind,
+ /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
+ Idx++;
+ }
+
+ CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
+ OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
+ DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBTy::Clang,
+ CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
+ RTLoc));
+ return;
}
const VarDecl *
@@ -3562,10 +2336,3 @@ llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
Args);
}
-
-llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
- ArrayRef<llvm::Value *> Args{};
- return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_get_warp_size),
- Args);
-}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 141436f26230dd..4d586ec972f8d6 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -150,9 +150,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
CodeGenFunction &CGF,
const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) override;
- /// Get the GPU warp size.
- llvm::Value *getGPUWarpSize(CodeGenFunction &CGF);
-
/// Get the id of the current thread on the GPU.
llvm::Value *getGPUThreadID(CodeGenFunction &CGF);
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
index fac42732022ad8..567598250301fd 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
@@ -36,14 +36,14 @@ void test() {
// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10:![0-9]+]]
+// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_kernel_environment, ptr [[DYN_PTR]])
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA14:![0-9]+]]
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4:[0-9]+]]
// CHECK1-NEXT: call void @__kmpc_target_deinit()
// CHECK1-NEXT: ret void
@@ -66,78 +66,78 @@ void test() {
// CHECK1-NEXT: [[REF_TMP:%.*]] = alloca float, align 4
// CHECK1-NEXT: [[REF_TMP2:%.*]] = alloca float, align 4
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8
-// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: [[ISTART:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[IEND:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 8)
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[IB]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
// CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
// CHECK1: omp.inner.for.cond.cleanup:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[TBAA16:![0-9]+]]
+// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP2]]) #[[ATTR4]]
-// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP2]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP2]], align 4
// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR12:[0-9]+]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP2]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4
// CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4
-// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4
// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4
-// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT: store ptr [[IEND]], ptr [[TMP11]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[IEND]], ptr [[TMP11]], align 8
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8
// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 3)
// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK1: omp.body.continue:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK1: omp.inner.for.end:
// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
@@ -161,9 +161,9 @@ void test() {
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
@@ -197,79 +197,79 @@ void test() {
// CHECK1-NEXT: [[REF_TMP15:%.*]] = alloca float, align 4
// CHECK1-NEXT: [[REF_TMP16:%.*]] = alloca float, align 4
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]]
// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 1
// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], 1
// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
-// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
// CHECK1: omp.precond.then:
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[PARTIAL_SUM5]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP6]]) #[[ATTR4]]
-// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP6]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP6]], align 4
// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR12]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP6]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I7]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
// CHECK1-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB3:[0-9]+]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
// CHECK1: omp.dispatch.cond:
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]]
// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP19]], 1
// CHECK1-NEXT: [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]]
// CHECK1-NEXT: br i1 [[CMP10]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_CLEANUP:%.*]]
@@ -278,28 +278,28 @@ void test() {
// CHECK1: omp.dispatch.body:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP21]], 1
// CHECK1-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP20]], [[ADD11]]
// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
// CHECK1: omp.inner.for.cond.cleanup:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP23]], 1
// CHECK1-NEXT: [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP14]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP15]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4
// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP24]] to float
-// CHECK1-NEXT: store float [[CONV]], ptr [[REF_TMP15]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT: store float [[CONV]], ptr [[REF_TMP15]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP16]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4
// CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP25]] to float
-// CHECK1-NEXT: store float [[CONV17]], ptr [[REF_TMP16]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT: store float [[CONV17]], ptr [[REF_TMP16]], align 4
// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP15]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR12]]
// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR12]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP16]]) #[[ATTR4]]
@@ -309,25 +309,25 @@ void test() {
// CHECK1: omp.body.continue:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP26]], 1
-// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK1: omp.inner.for.end:
// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
// CHECK1: omp.dispatch.inc:
-// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
// CHECK1-NEXT: [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]]
-// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
// CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]]
-// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]]
// CHECK1: omp.dispatch.end:
// CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]])
// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8
@@ -358,53 +358,53 @@ void test() {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[__C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
// CHECK1-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(ptr nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR12]]
// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa [[TBAA18:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[__RE_]], align 4
// CHECK1-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[CALL]]
-// CHECK1-NEXT: store float [[ADD]], ptr [[__RE_]], align 4, !tbaa [[TBAA18]]
-// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store float [[ADD]], ptr [[__RE_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
// CHECK1-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(ptr nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR12]]
// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[TBAA20:![0-9]+]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[__IM_]], align 4
// CHECK1-NEXT: [[ADD3:%.*]] = fadd float [[TMP3]], [[CALL2]]
-// CHECK1-NEXT: store float [[ADD3]], ptr [[__IM_]], align 4, !tbaa [[TBAA20]]
+// CHECK1-NEXT: store float [[ADD3]], ptr [[__IM_]], align 4
// CHECK1-NEXT: ret ptr [[THIS1]]
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
-// CHECK1-SAME: (ptr [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR7:[0-9]+]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR7:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex", align 4
-// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21:![0-9]+]]
-// CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]]
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex", align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
+// CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
+// CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
+// CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
+// CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr %"class.std::complex", ptr [[TMP9]], i64 1
-// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP9]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP9]], align 8
// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK1-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
// CHECK1-NEXT: [[TMP15:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]])
-// CHECK1-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK1-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 8
// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP9]], i64 1
// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[DOTOMP_REDUCTION_ELEMENT]], i64 1
-// CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8
// CHECK1-NEXT: [[TMP18:%.*]] = icmp eq i16 [[TMP7]], 0
// CHECK1-NEXT: [[TMP19:%.*]] = icmp eq i16 [[TMP7]], 1
// CHECK1-NEXT: [[TMP20:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
@@ -433,7 +433,7 @@ void test() {
// CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[TMP33]], align 8
// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
// CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[TMP35]], align 8
-// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP36]], ptr align 4 [[TMP34]], i64 8, i1 false), !tbaa.struct [[TBAA_STRUCT23:![0-9]+]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP36]], ptr align 8 [[TMP34]], i64 8, i1 false)
// CHECK1-NEXT: br label [[IFCONT6:%.*]]
// CHECK1: else5:
// CHECK1-NEXT: br label [[IFCONT6]]
@@ -442,24 +442,24 @@ void test() {
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
-// CHECK1-SAME: (ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR7]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
-// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4
// CHECK1-NEXT: br label [[PRECOND:%.*]]
// CHECK1: precond:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
// CHECK1-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2
// CHECK1-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
// CHECK1: body:
@@ -468,7 +468,7 @@ void test() {
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
@@ -478,45 +478,45 @@ void test() {
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK1: then2:
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
-// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
+// CHECK1-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
// CHECK1-NEXT: br label [[IFCONT4:%.*]]
// CHECK1: else3:
// CHECK1-NEXT: br label [[IFCONT4]]
// CHECK1: ifcont4:
// CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
-// CHECK1-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
// CHECK1-NEXT: br label [[PRECOND]]
// CHECK1: exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined_wrapper
-// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR8:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0
-// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1
-// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2
-// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]]
// CHECK1-NEXT: ret void
//
@@ -527,14 +527,14 @@ void test() {
// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_kernel_environment, ptr [[DYN_PTR]])
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
// CHECK1-NEXT: call void @__kmpc_target_deinit()
// CHECK1-NEXT: ret void
@@ -557,78 +557,78 @@ void test() {
// CHECK1-NEXT: [[REF_TMP:%.*]] = alloca double, align 8
// CHECK1-NEXT: [[REF_TMP2:%.*]] = alloca double, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8
-// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: [[ISTART:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[IEND:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 16)
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[IB]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
// CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
// CHECK1: omp.inner.for.cond.cleanup:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[TBAA24:![0-9]+]]
+// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP2]]) #[[ATTR4]]
-// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP2]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP2]], align 8
// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR12]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP2]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4
// CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4
-// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4
// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4
-// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT: store ptr [[IEND]], ptr [[TMP11]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[IEND]], ptr [[TMP11]], align 8
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8
// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 3)
// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK1: omp.body.continue:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK1: omp.inner.for.end:
// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
@@ -652,9 +652,9 @@ void test() {
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
@@ -688,79 +688,79 @@ void test() {
// CHECK1-NEXT: [[REF_TMP15:%.*]] = alloca double, align 8
// CHECK1-NEXT: [[REF_TMP16:%.*]] = alloca double, align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[IEND]], ptr [[IEND_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]]
// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 1
// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], 1
// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
-// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
// CHECK1: omp.precond.then:
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[PARTIAL_SUM5]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP6]]) #[[ATTR4]]
-// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP6]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP6]], align 8
// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR12]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP6]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I7]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
// CHECK1-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB3]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
// CHECK1: omp.dispatch.cond:
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]]
// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP19]], 1
// CHECK1-NEXT: [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]]
// CHECK1-NEXT: br i1 [[CMP10]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_CLEANUP:%.*]]
@@ -769,28 +769,28 @@ void test() {
// CHECK1: omp.dispatch.body:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP21]], 1
// CHECK1-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP20]], [[ADD11]]
// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
// CHECK1: omp.inner.for.cond.cleanup:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP23]], 1
// CHECK1-NEXT: [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[REF_TMP14]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP15]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4
// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP24]] to double
-// CHECK1-NEXT: store double [[CONV]], ptr [[REF_TMP15]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT: store double [[CONV]], ptr [[REF_TMP15]], align 8
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP16]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4
// CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP25]] to double
-// CHECK1-NEXT: store double [[CONV17]], ptr [[REF_TMP16]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT: store double [[CONV17]], ptr [[REF_TMP16]], align 8
// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP15]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR12]]
// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) ptr @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR12]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP16]]) #[[ATTR4]]
@@ -800,25 +800,25 @@ void test() {
// CHECK1: omp.body.continue:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP26]], 1
-// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK1: omp.inner.for.end:
// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
// CHECK1: omp.dispatch.inc:
-// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
// CHECK1-NEXT: [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]]
-// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
// CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]]
-// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4
// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]]
// CHECK1: omp.dispatch.end:
// CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]])
// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8
@@ -849,26 +849,26 @@ void test() {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[__C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[__C]], ptr [[__C_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
// CHECK1-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(ptr nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR12]]
// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa [[TBAA26:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[__RE_]], align 8
// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[CALL]]
-// CHECK1-NEXT: store double [[ADD]], ptr [[__RE_]], align 8, !tbaa [[TBAA26]]
-// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store double [[ADD]], ptr [[__RE_]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
// CHECK1-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(ptr nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR12]]
// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa [[TBAA28:![0-9]+]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[__IM_]], align 8
// CHECK1-NEXT: [[ADD3:%.*]] = fadd double [[TMP3]], [[CALL2]]
-// CHECK1-NEXT: store double [[ADD3]], ptr [[__IM_]], align 8, !tbaa [[TBAA28]]
+// CHECK1-NEXT: store double [[ADD3]], ptr [[__IM_]], align 8
// CHECK1-NEXT: ret ptr [[THIS1]]
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func1
-// CHECK1-SAME: (ptr [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR7]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
@@ -876,14 +876,14 @@ void test() {
// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex.0", align 8
-// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]]
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
+// CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
+// CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
+// CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
+// CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
@@ -908,7 +908,7 @@ void test() {
// CHECK1-NEXT: [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
// CHECK1-NEXT: br label [[DOTSHUFFLE_PRE_COND]]
// CHECK1: .shuffle.exit:
-// CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8
// CHECK1-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
// CHECK1-NEXT: [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
// CHECK1-NEXT: [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
@@ -937,7 +937,7 @@ void test() {
// CHECK1-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
// CHECK1-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
-// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP43]], ptr align 8 [[TMP41]], i64 16, i1 false), !tbaa.struct [[TBAA_STRUCT29:![0-9]+]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP43]], ptr align 8 [[TMP41]], i64 16, i1 false)
// CHECK1-NEXT: br label [[IFCONT6:%.*]]
// CHECK1: else5:
// CHECK1-NEXT: br label [[IFCONT6]]
@@ -946,24 +946,24 @@ void test() {
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func2
-// CHECK1-SAME: (ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR7]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
-// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4
// CHECK1-NEXT: br label [[PRECOND:%.*]]
// CHECK1: precond:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
// CHECK1-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 4
// CHECK1-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
// CHECK1: body:
@@ -972,7 +972,7 @@ void test() {
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
@@ -982,45 +982,45 @@ void test() {
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK1: then2:
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
-// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
+// CHECK1-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
// CHECK1-NEXT: br label [[IFCONT4:%.*]]
// CHECK1: else3:
// CHECK1-NEXT: br label [[IFCONT4]]
// CHECK1: ifcont4:
// CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
-// CHECK1-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
// CHECK1-NEXT: br label [[PRECOND]]
// CHECK1: exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined_wrapper
-// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR8]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0
-// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1
-// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2
-// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]]
// CHECK1-NEXT: ret void
//
@@ -1031,18 +1031,18 @@ void test() {
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4, !tbaa [[TBAA16]]
-// CHECK1-NEXT: store float [[TMP1]], ptr [[__RE_]], align 4, !tbaa [[TBAA18]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4
+// CHECK1-NEXT: store float [[TMP1]], ptr [[__RE_]], align 4
// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4, !tbaa [[TBAA16]]
-// CHECK1-NEXT: store float [[TMP3]], ptr [[__IM_]], align 4, !tbaa [[TBAA20]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4
+// CHECK1-NEXT: store float [[TMP3]], ptr [[__IM_]], align 4
// CHECK1-NEXT: ret void
//
//
@@ -1050,10 +1050,10 @@ void test() {
// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa [[TBAA18]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__RE_]], align 4
// CHECK1-NEXT: ret float [[TMP0]]
//
//
@@ -1061,10 +1061,10 @@ void test() {
// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[TBAA20]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[__IM_]], align 4
// CHECK1-NEXT: ret float [[TMP0]]
//
//
@@ -1074,18 +1074,18 @@ void test() {
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8, !tbaa [[TBAA24]]
-// CHECK1-NEXT: store double [[TMP1]], ptr [[__RE_]], align 8, !tbaa [[TBAA26]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8
+// CHECK1-NEXT: store double [[TMP1]], ptr [[__RE_]], align 8
// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[TMP2]], align 8, !tbaa [[TBAA24]]
-// CHECK1-NEXT: store double [[TMP3]], ptr [[__IM_]], align 8, !tbaa [[TBAA28]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load double, ptr [[TMP2]], align 8
+// CHECK1-NEXT: store double [[TMP3]], ptr [[__IM_]], align 8
// CHECK1-NEXT: ret void
//
//
@@ -1093,10 +1093,10 @@ void test() {
// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa [[TBAA26]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__RE_]], align 8
// CHECK1-NEXT: ret double [[TMP0]]
//
//
@@ -1104,9 +1104,9 @@ void test() {
// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa [[TBAA28]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load double, ptr [[__IM_]], align 8
// CHECK1-NEXT: ret double [[TMP0]]
//
diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
index 360a780c75383c..e7beb174872321 100644
--- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
@@ -579,7 +579,7 @@ int bar(int n){
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
// CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-// CHECK1-NEXT: store i8 [[TMP9]], ptr [[C]], align 4
+// CHECK1-NEXT: store i8 [[TMP9]], ptr [[C]], align 1
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 [[TMP5]]
@@ -630,7 +630,7 @@ int bar(int n){
// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 1
// CHECK1-NEXT: store i8 [[TMP9]], ptr [[TMP7]], align 1
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
@@ -1156,7 +1156,7 @@ int bar(int n){
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
-// CHECK1-NEXT: store i16 [[TMP13]], ptr [[B]], align 4
+// CHECK1-NEXT: store i16 [[TMP13]], ptr [[B]], align 2
// CHECK1-NEXT: ret void
//
//
@@ -1207,7 +1207,7 @@ int bar(int n){
// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 2
// CHECK1-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
// CHECK1-NEXT: ret void
//
@@ -1766,7 +1766,7 @@ int bar(int n){
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
// CHECK2-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-// CHECK2-NEXT: store i8 [[TMP9]], ptr [[C]], align 4
+// CHECK2-NEXT: store i8 [[TMP9]], ptr [[C]], align 1
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 [[TMP5]]
@@ -1817,7 +1817,7 @@ int bar(int n){
// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
-// CHECK2-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 1
// CHECK2-NEXT: store i8 [[TMP9]], ptr [[TMP7]], align 1
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
@@ -2343,7 +2343,7 @@ int bar(int n){
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
-// CHECK2-NEXT: store i16 [[TMP13]], ptr [[B]], align 4
+// CHECK2-NEXT: store i16 [[TMP13]], ptr [[B]], align 2
// CHECK2-NEXT: ret void
//
//
@@ -2394,7 +2394,7 @@ int bar(int n){
// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
-// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 2
// CHECK2-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
// CHECK2-NEXT: ret void
//
@@ -2953,7 +2953,7 @@ int bar(int n){
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-// CHECK3-NEXT: store i8 [[TMP9]], ptr [[C]], align 4
+// CHECK3-NEXT: store i8 [[TMP9]], ptr [[C]], align 1
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 [[TMP5]]
@@ -3004,7 +3004,7 @@ int bar(int n){
// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
-// CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[C]], align 1
// CHECK3-NEXT: store i8 [[TMP9]], ptr [[TMP7]], align 1
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
@@ -3530,7 +3530,7 @@ int bar(int n){
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
-// CHECK3-NEXT: store i16 [[TMP13]], ptr [[B]], align 4
+// CHECK3-NEXT: store i16 [[TMP13]], ptr [[B]], align 2
// CHECK3-NEXT: ret void
//
//
@@ -3581,7 +3581,7 @@ int bar(int n){
// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
-// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 2
// CHECK3-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
// CHECK3-NEXT: ret void
//
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
index 22cf534bf0ba27..95f69ef430eaa0 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
@@ -1197,6 +1197,7 @@ int foo() {
// IR-GPU-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
// IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
// IR-GPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
// IR-GPU-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
@@ -1214,7 +1215,6 @@ int foo() {
// IR-GPU-NEXT: [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
// IR-GPU-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
-// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
// IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
@@ -1370,6 +1370,7 @@ int foo() {
// IR-GPU-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
// IR-GPU-NEXT: [[J5:%.*]] = alloca i32, align 4, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
// IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
// IR-GPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
// IR-GPU-NEXT: [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
@@ -1387,7 +1388,6 @@ int foo() {
// IR-GPU-NEXT: [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
// IR-GPU-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
// IR-GPU-NEXT: [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
-// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
// IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
@@ -1511,12 +1511,12 @@ int foo() {
// IR-GPU-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
// IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
// IR-GPU-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
-// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
// IR-GPU-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
@@ -1592,10 +1592,10 @@ int foo() {
// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
// IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
-// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
// IR-GPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
@@ -1656,12 +1656,12 @@ int foo() {
// IR-GPU-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
// IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
// IR-GPU-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
-// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
// IR-GPU-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
@@ -1737,10 +1737,10 @@ int foo() {
// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
// IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
-// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
// IR-GPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index e1dba2339338a3..4f2207cc700fbf 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -100,6 +100,9 @@ class OpenMPIRBuilderConfig {
/// expanded.
std::optional<bool> IsGPU;
+ /// Flag for specifying if LLVMUsed information should be emitted.
+ std::optional<bool> EmitLLVMUsed;
+
/// Flag for specifying if offloading is mandatory.
std::optional<bool> OpenMPOffloadMandatory;
@@ -178,6 +181,7 @@ class OpenMPIRBuilderConfig {
void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
void setIsGPU(bool Value) { IsGPU = Value; }
+ void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsed = Value; }
void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
void setSeparator(StringRef S) { Separator = S; }
@@ -622,15 +626,17 @@ class OpenMPIRBuilder {
/// Generator for '#omp barrier'
///
/// \param Loc The location where the barrier directive was encountered.
- /// \param DK The kind of directive that caused the barrier.
+ /// \param Kind The kind of directive that caused the barrier.
/// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
/// \param CheckCancelFlag Flag to indicate a cancel barrier return value
/// should be checked and acted upon.
+ /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
///
/// \returns The insertion point after the barrier.
InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive DK,
bool ForceSimpleCall = false,
- bool CheckCancelFlag = true);
+ bool CheckCancelFlag = true,
+ Value *ThreadID = nullptr);
/// Generator for '#omp cancel'
///
@@ -1250,39 +1256,56 @@ class OpenMPIRBuilder {
getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
StringRef ParentName = "");
- // using ReductionGenTy =
- // function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;
+ /// Enum class for the ReductionGen CallBack type to be used.
+ enum class ReductionGenCBTy { Clang, MLIR };
- // using AtomicReductionGenTy =
- // function_ref<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+ /// ReductionGen CallBack for Clang
+ ///
+ /// \param CodeGenIP InsertPoint for CodeGen.
+ /// \param Index Index of the ReductionInfo to generate code for.
+ /// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for
+ /// codegen, used for fixup later.
+ /// \param RHSPtr Optionally used by Clang to
+ /// return the RHSPtr it used for codegen, used for fixup later.
+ /// \param CurFn Optionally used by Clang to pass in the Current Function as
+ /// Clang context may be old.
+ using ReductionGenCBClang =
+ std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
+ Value **LHS, Value **RHS, Function *CurFn)>;
+
+ /// ReductionGen CallBack for MLIR
+ ///
+ /// \param CodeGenIP InsertPoint for CodeGen.
+ /// \param LHS Pass in the LHS Value to be used for CodeGen.
+ /// \param RHS Pass in the RHS Value to be used for CodeGen.
+ using ReductionGenCB = std::function<InsertPointTy(
+ InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
- /// Owning equivalents of OpenMPIRBuilder::(Atomic)ReductionGen that are used
- /// to
- /// store lambdas with capture.
- /// Functions used to generate reductions. Such functions take two Values
- /// representing LHS and RHS of the reduction, respectively, and a reference
- /// to the value that is updated to refer to the reduction result.
- using ReductionGenTy = std::function<OpenMPIRBuilder::InsertPointTy(
- OpenMPIRBuilder::InsertPointTy, Value *, Value *, Value *&)>;
/// Functions used to generate atomic reductions. Such functions take two
/// Values representing pointers to LHS and RHS of the reduction, as well as
/// the element type of these pointers. They are expected to atomically
/// update the LHS to the reduced value.
- using AtomicReductionGenTy = std::function<OpenMPIRBuilder::InsertPointTy(
- OpenMPIRBuilder::InsertPointTy, Type *, Value *, Value *)>;
+ using AtomicReductionGenCB =
+ std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+
+ /// Enum class for reduction evaluation types scalar, complex and aggregate.
+ enum class EvaluationKindTy { Scalar, Complex, Aggregate };
/// Information about an OpenMP reduction.
struct ReductionInfo {
ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
- ReductionGenTy ReductionGen,
- AtomicReductionGenTy AtomicReductionGen)
+ EvaluationKindTy EvaluationKind, ReductionGenCB ReductionGen,
+ ReductionGenCBClang ReductionGenClang,
+ AtomicReductionGenCB AtomicReductionGen)
: ElementType(ElementType), Variable(Variable),
- PrivateVariable(PrivateVariable), ReductionGen(ReductionGen),
+ PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
+ ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
AtomicReductionGen(AtomicReductionGen) {}
ReductionInfo(Value *PrivateVariable)
: ElementType(nullptr), Variable(nullptr),
- PrivateVariable(PrivateVariable), ReductionGen(),
- AtomicReductionGen() {}
+ PrivateVariable(PrivateVariable),
+ EvaluationKind(EvaluationKindTy::Scalar), ReductionGen(),
+ ReductionGenClang(), AtomicReductionGen() {}
/// Reduction element type, must match pointee type of variable.
Type *ElementType;
@@ -1293,22 +1316,43 @@ class OpenMPIRBuilder {
/// Thread-private partial reduction variable.
Value *PrivateVariable;
+ /// Reduction evaluation type - scalar, complex or aggregate.
+ EvaluationKindTy EvaluationKind;
+
/// Callback for generating the reduction body. The IR produced by this will
/// be used to combine two values in a thread-safe context, e.g., under
/// lock or within the same thread, and therefore need not be atomic.
- ReductionGenTy ReductionGen;
+ ReductionGenCB ReductionGen;
+
+ /// Clang callback for generating the reduction body. The IR produced by
+ /// this will be used to combine two values in a thread-safe context, e.g.,
+ /// under lock or within the same thread, and therefore need not be atomic.
+ ReductionGenCBClang ReductionGenClang;
/// Callback for generating the atomic reduction body, may be null. The IR
/// produced by this will be used to atomically combine two values during
/// reduction. If null, the implementation will use the non-atomic version
/// along with the appropriate synchronization mechanisms.
- AtomicReductionGenTy AtomicReductionGen;
+ AtomicReductionGenCB AtomicReductionGen;
+ };
+
+ enum class CopyAction : unsigned {
+ // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
+ // the warp using shuffle instructions.
+ RemoteLaneToThread,
+ // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
+ ThreadCopy,
+ };
+
+ struct CopyOptionsTy {
+ Value *RemoteLaneOffset = nullptr;
+ Value *ScratchpadIndex = nullptr;
+ Value *ScratchpadWidth = nullptr;
};
/// A class that manages the reduction info to facilitate lowering of
/// reductions at multiple levels of parallelism. For example handling teams
/// and parallel reductions on GPUs
-
class ReductionInfoManager {
private:
SmallVector<ReductionInfo> ReductionInfos;
@@ -1321,15 +1365,12 @@ class OpenMPIRBuilder {
PrivateVarAllocaIP.reset();
}
- Value *
- allocatePrivateReductionVar(IRBuilderBase &builder,
- llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
- Type *VarType) {
- llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext());
- llvm::Value *var = builder.CreateAlloca(VarType);
+ Value *allocatePrivateReductionVar(IRBuilderBase &builder,
+ InsertPointTy &allocaIP, Type *VarType) {
+ Type *ptrTy = PointerType::getUnqual(builder.getContext());
+ Value *var = builder.CreateAlloca(VarType);
var->setName("private_redvar");
- llvm::Value *castVar =
- builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+ Value *castVar = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
ReductionInfos.push_back(ReductionInfo(castVar));
return castVar;
}
@@ -1353,20 +1394,508 @@ class OpenMPIRBuilder {
void setPrivateVarAllocaIP(InsertPointTy IP) { PrivateVarAllocaIP = IP; }
};
+ /// Supporting functions for Reductions CodeGen.
+private:
+ /// Emit the llvm.used metadata.
+ void emitUsed(StringRef Name, std::vector<llvm::WeakTrackingVH> &List);
+
+ /// Get the id of the current thread on the GPU.
+ Value *getGPUThreadID();
+
+ /// Get the GPU warp size.
+ Value *getGPUWarpSize();
+
+ /// Get the id of the warp in the block.
+ /// We assume that the warp size is 32, which is always the case
+ /// on the NVPTX device, to generate more efficient code.
+ Value *getNVPTXWarpID();
+
+ /// Get the id of the current lane in the Warp.
+ /// We assume that the warp size is 32, which is always the case
+ /// on the NVPTX device, to generate more efficient code.
+ Value *getNVPTXLaneID();
+
+ /// Cast value to the specified type.
+ Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
+
+ /// This function creates calls to one of two shuffle functions to copy
+ /// variables between lanes in a warp.
+ Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
+ Type *ElementType, Value *Offset);
+
+ void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
+ Type *ElementType, Value *Offset,
+ Type *ReductionArrayTy);
+
+ /// Emit instructions to copy a Reduce list, which contains partially
+ /// aggregated values, in the specified direction.
+ void emitReductionListCopy(
+ InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
+ ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
+ CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
+
+ /// Emit a helper that reduces data across two OpenMP threads (lanes)
+ /// in the same warp. It uses shuffle instructions to copy over data from
+ /// a remote lane's stack. The reduction algorithm performed is specified
+ /// by the fourth parameter.
+ ///
+ /// Algorithm Versions.
+ /// Full Warp Reduce (argument value 0):
+ /// This algorithm assumes that all 32 lanes are active and gathers
+ /// data from these 32 lanes, producing a single resultant value.
+ /// Contiguous Partial Warp Reduce (argument value 1):
+ /// This algorithm assumes that only a *contiguous* subset of lanes
+ /// are active. This happens for the last warp in a parallel region
+ /// when the user specified num_threads is not an integer multiple of
+ /// 32. This contiguous subset always starts with the zeroth lane.
+ /// Partial Warp Reduce (argument value 2):
+ /// This algorithm gathers data from any number of lanes at any position.
+ /// All reduced values are stored in the lowest possible lane. The set
+ /// of problems every algorithm addresses is a super set of those
+ /// addressable by algorithms with a lower version number. Overhead
+ /// increases as algorithm version increases.
+ ///
+ /// Terminology
+ /// Reduce element:
+ /// Reduce element refers to the individual data field with primitive
+ /// data types to be combined and reduced across threads.
+ /// Reduce list:
+ /// Reduce list refers to a collection of local, thread-private
+ /// reduce elements.
+ /// Remote Reduce list:
+ /// Remote Reduce list refers to a collection of remote (relative to
+ /// the current thread) reduce elements.
+ ///
+ /// We distinguish between three states of threads that are important to
+ /// the implementation of this function.
+ /// Alive threads:
+ /// Threads in a warp executing the SIMT instruction, as distinguished from
+ /// threads that are inactive due to divergent control flow.
+ /// Active threads:
+ /// The minimal set of threads that has to be alive upon entry to this
+ /// function. The computation is correct iff active threads are alive.
+ /// Some threads are alive but they are not active because they do not
+ /// contribute to the computation in any useful manner. Turning them off
+ /// may introduce control flow overheads without any tangible benefits.
+ /// Effective threads:
+ /// In order to comply with the argument requirements of the shuffle
+ /// function, we must keep all lanes holding data alive. But at most
+ /// half of them perform value aggregation; we refer to this half of
+ /// threads as effective. The other half is simply handing off their
+ /// data.
+ ///
+ /// Procedure
+ /// Value shuffle:
+ /// In this step active threads transfer data from higher lane positions
+ /// in the warp to lower lane positions, creating Remote Reduce list.
+ /// Value aggregation:
+ /// In this step, effective threads combine their thread local Reduce list
+ /// with Remote Reduce list and store the result in the thread local
+ /// Reduce list.
+ /// Value copy:
+ /// In this step, we deal with the assumption made by algorithm 2
+ /// (i.e. contiguity assumption). When we have an odd number of lanes
+ /// active, say 2k+1, only k threads will be effective and therefore k
+ /// new values will be produced. However, the Reduce list owned by the
+ /// (2k+1)th thread is ignored in the value aggregation. Therefore
+ /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
+ /// that the contiguity assumption still holds.
+ ///
+ /// \param ReductionInfos Array type containing the ReductionOps.
+ /// \param ReduceFn The reduction function.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The ShuffleAndReduce function.
+ Function *emitShuffleAndReduceFunction(
+ ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+ Function *ReduceFn, AttributeList FuncAttrs);
+
+ /// This function emits a helper that gathers Reduce lists from the first
+ /// lane of every active warp to lanes in the first warp.
+ ///
+ /// void inter_warp_copy_func(void* reduce_data, num_warps)
+ /// shared smem[warp_size];
+ /// For all data entries D in reduce_data:
+ /// sync
+ /// If (I am the first lane in each warp)
+ /// Copy my local D to smem[warp_id]
+ /// sync
+ /// if (I am the first warp)
+ /// Copy smem[thread_id] to my local D
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param ReductionInfos Array type containing the ReductionOps.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The InterWarpCopy function.
+ Function *emitInterWarpCopyFunction(const LocationDescription &Loc,
+ ArrayRef<ReductionInfo> ReductionInfos,
+ AttributeList FuncAttrs);
+
+ /// This function emits a helper that copies all the reduction variables from
+ /// the team into the provided global buffer for the reduction variables.
+ ///
+ /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
+ /// For all data entries D in reduce_data:
+ /// Copy local D to buffer.D[Idx]
+ ///
+ /// \param ReductionInfos Array type containing the ReductionOps.
+ /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The ListToGlobalCopy function.
+ Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
+ Type *ReductionsBufferTy,
+ AttributeList FuncAttrs);
+
+ /// This function emits a helper that copies all the reduction variables from
+ /// the provided global buffer back into the team's reduction variables.
+ ///
+ /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
+ /// For all data entries D in reduce_data:
+ /// Copy buffer.D[Idx] to local D;
+ ///
+ /// \param ReductionInfos Array type containing the ReductionOps.
+ /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The GlobalToList function.
+ Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
+ Type *ReductionsBufferTy,
+ AttributeList FuncAttrs);
+
+ /// This function emits a helper that reduces all the reduction variables from
+ /// the team into the provided global buffer for the reduction variables.
+ ///
+ /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
+ /// void *GlobPtrs[];
+ /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
+ /// ...
+ /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
+ /// reduce_function(GlobPtrs, reduce_data);
+ ///
+ /// \param ReductionInfos Array type containing the ReductionOps.
+ /// \param ReduceFn The reduction function.
+ /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The ListToGlobalReduce function.
+ Function *
+ emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
+ Function *ReduceFn, Type *ReductionsBufferTy,
+ AttributeList FuncAttrs);
+
+ /// This function emits a helper that reduces all the reduction variables from
+ /// the team into the provided global buffer for the reduction variables.
+ ///
+ /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
+ /// void *GlobPtrs[];
+ /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
+ /// ...
+ /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
+ /// reduce_function(reduce_data, GlobPtrs);
+ ///
+ /// \param ReductionInfos Array type containing the ReductionOps.
+ /// \param ReduceFn The reduction function.
+ /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The GlobalToListReduce function.
+ Function *
+ emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
+ Function *ReduceFn, Type *ReductionsBufferTy,
+ AttributeList FuncAttrs);
+
+ /// Get the function name of a reduction function.
+ std::string getReductionFuncName(StringRef Name) const;
+
+ /// Emits reduction function.
+ /// \param ReducerName Name of the function calling the reduction.
+ /// \param ReductionInfos Array type containing the ReductionOps.
+ /// \param IsGpu Optional param to specify CodeGen for GPU Offloading.
+ /// \param ReductionGenCBTy Optional param to specify the reduction generation
+ /// callback type (Clang or MLIR).
+ /// \param FuncAttrs Optional param to specify any function attributes that
+ /// need to be copied to the new function.
+ ///
+ /// \return The reduction function.
+ Function *createReductionFunction(
+ StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
+ bool IsGpu = false,
+ ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
+ AttributeList FuncAttrs = {});
+
+public:
+ ///
+ /// Design of OpenMP reductions on the GPU
+ ///
+ /// Consider a typical OpenMP program with one or more reduction
+ /// clauses:
+ ///
+ /// float foo;
+ /// double bar;
+ /// #pragma omp target teams distribute parallel for \
+ /// reduction(+:foo) reduction(*:bar)
+ /// for (int i = 0; i < N; i++) {
+ /// foo += A[i]; bar *= B[i];
+ /// }
+ ///
+ /// where 'foo' and 'bar' are reduced across all OpenMP threads in
+ /// all teams. In our OpenMP implementation on the NVPTX device an
+ /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
+ /// within a team are mapped to CUDA threads within a threadblock.
+ /// Our goal is to efficiently aggregate values across all OpenMP
+ /// threads such that:
+ ///
+ /// - the compiler and runtime are logically concise, and
+ /// - the reduction is performed efficiently in a hierarchical
+ /// manner as follows: within OpenMP threads in the same warp,
+ /// across warps in a threadblock, and finally across teams on
+ /// the NVPTX device.
+ ///
+ /// Introduction to Decoupling
+ ///
+ /// We would like to decouple the compiler and the runtime so that the
+ /// latter is ignorant of the reduction variables (number, data types)
+ /// and the reduction operators. This allows a simpler interface
+ /// and implementation while still attaining good performance.
+ ///
+ /// Pseudocode for the aforementioned OpenMP program generated by the
+ /// compiler is as follows:
+ ///
+ /// 1. Create private copies of reduction variables on each OpenMP
+ /// thread: 'foo_private', 'bar_private'
+ /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
+ /// to it and writes the result in 'foo_private' and 'bar_private'
+ /// respectively.
+ /// 3. Call the OpenMP runtime on the GPU to reduce within a team
+ /// and store the result on the team master:
+ ///
+ /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
+ /// reduceData, shuffleReduceFn, interWarpCpyFn)
+ ///
+ /// where:
+ /// struct ReduceData {
+ /// double *foo;
+ /// double *bar;
+ /// } reduceData
+ /// reduceData.foo = &foo_private
+ /// reduceData.bar = &bar_private
+ ///
+ /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
+ /// auxiliary functions generated by the compiler that operate on
+ /// variables of type 'ReduceData'. They aid the runtime perform
+ /// algorithmic steps in a data agnostic manner.
+ ///
+ /// 'shuffleReduceFn' is a pointer to a function that reduces data
+ /// of type 'ReduceData' across two OpenMP threads (lanes) in the
+ /// same warp. It takes the following arguments as input:
+ ///
+ /// a. variable of type 'ReduceData' on the calling lane,
+ /// b. its lane_id,
+ /// c. an offset relative to the current lane_id to generate a
+ /// remote_lane_id. The remote lane contains the second
+ /// variable of type 'ReduceData' that is to be reduced.
+ /// d. an algorithm version parameter determining which reduction
+ /// algorithm to use.
+ ///
+ /// 'shuffleReduceFn' retrieves data from the remote lane using
+ /// efficient GPU shuffle intrinsics and reduces, using the
+ /// algorithm specified by the 4th parameter, the two operands
+ /// element-wise. The result is written to the first operand.
+ ///
+ /// Different reduction algorithms are implemented in different
+ /// runtime functions, all calling 'shuffleReduceFn' to perform
+ /// the essential reduction step. Therefore, based on the 4th
+ /// parameter, this function behaves slightly differently to
+ /// cooperate with the runtime to ensure correctness under
+ /// different circumstances.
+ ///
+ /// 'InterWarpCpyFn' is a pointer to a function that transfers
+ /// reduced variables across warps. It tunnels, through CUDA
+ /// shared memory, the thread-private data of type 'ReduceData'
+ /// from lane 0 of each warp to a lane in the first warp.
+ /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
+ /// The last team writes the global reduced value to memory.
+ ///
+ /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
+ /// reduceData, shuffleReduceFn, interWarpCpyFn,
+ /// scratchpadCopyFn, loadAndReduceFn)
+ ///
+ /// 'scratchpadCopyFn' is a helper that stores reduced
+ /// data from the team master to a scratchpad array in
+ /// global memory.
+ ///
+ /// 'loadAndReduceFn' is a helper that loads data from
+ /// the scratchpad array and reduces it with the input
+ /// operand.
+ ///
+ /// These compiler generated functions hide address
+ /// calculation and alignment information from the runtime.
+ /// 5. if ret == 1:
+ /// The team master of the last team stores the reduced
+ /// result to the globals in memory.
+ /// foo += reduceData.foo; bar *= reduceData.bar
+ ///
+ ///
+ /// Warp Reduction Algorithms
+ ///
+ /// On the warp level, we have three algorithms implemented in the
+ /// OpenMP runtime depending on the number of active lanes:
+ ///
+ /// Full Warp Reduction
+ ///
+ /// The reduce algorithm within a warp where all lanes are active
+ /// is implemented in the runtime as follows:
+ ///
+ /// full_warp_reduce(void *reduce_data,
+ /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+ /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
+ /// ShuffleReduceFn(reduce_data, 0, offset, 0);
+ /// }
+ ///
+ /// The algorithm completes in log(2, WARPSIZE) steps.
+ ///
+ /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
+ /// not used therefore we save instructions by not retrieving lane_id
+ /// from the corresponding special registers. The 4th parameter, which
+ /// represents the version of the algorithm being used, is set to 0 to
+ /// signify full warp reduction.
+ ///
+ /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+ ///
+ /// #reduce_elem refers to an element in the local lane's data structure
+ /// #remote_elem is retrieved from a remote lane
+ /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+ /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
+ ///
+ /// Contiguous Partial Warp Reduction
+ ///
+ /// This reduce algorithm is used within a warp where only the first
+ /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
+ /// number of OpenMP threads in a parallel region is not a multiple of
+ /// WARPSIZE. The algorithm is implemented in the runtime as follows:
+ ///
+ /// void
+ /// contiguous_partial_reduce(void *reduce_data,
+ /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
+ /// int size, int lane_id) {
+ /// int curr_size;
+ /// int offset;
+ /// curr_size = size;
+ /// offset = curr_size/2;
+ /// while (offset>0) {
+ /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
+ /// curr_size = (curr_size+1)/2;
+ /// offset = curr_size/2;
+ /// }
+ /// }
+ ///
+ /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+ ///
+ /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+ /// if (lane_id < offset)
+ /// reduce_elem = reduce_elem REDUCE_OP remote_elem
+ /// else
+ /// reduce_elem = remote_elem
+ ///
+ /// This algorithm assumes that the data to be reduced are located in a
+ /// contiguous subset of lanes starting from the first. When there is
+ /// an odd number of active lanes, the data in the last lane is not
+ /// aggregated with any other lane's data but is instead copied over.
+ ///
+ /// Dispersed Partial Warp Reduction
+ ///
+ /// This algorithm is used within a warp when any discontiguous subset of
+ /// lanes are active. It is used to implement the reduction operation
+ /// across lanes in an OpenMP simd region or in a nested parallel region.
+ ///
+ /// void
+ /// dispersed_partial_reduce(void *reduce_data,
+ /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+ /// int size, remote_id;
+ /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
+ /// do {
+ /// remote_id = next_active_lane_id_right_after_me();
+ /// # the above function returns 0 if no active lane
+ /// # is present right after the current lane.
+ /// size = number_of_active_lanes_in_this_warp();
+ /// logical_lane_id /= 2;
+ /// ShuffleReduceFn(reduce_data, logical_lane_id,
+ /// remote_id-1-threadIdx.x, 2);
+ /// } while (logical_lane_id % 2 == 0 && size > 1);
+ /// }
+ ///
+ /// There is no assumption made about the initial state of the reduction.
+ /// Any number of lanes (>=1) could be active at any position. The reduction
+ /// result is returned in the first active lane.
+ ///
+ /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+ ///
+ /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+ /// if (lane_id % 2 == 0 && offset > 0)
+ /// reduce_elem = reduce_elem REDUCE_OP remote_elem
+ /// else
+ /// reduce_elem = remote_elem
+ ///
+ ///
+ /// Intra-Team Reduction
+ ///
+ /// This function, as implemented in the runtime call
+ /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
+ /// threads in a team. It first reduces within a warp using the
+ /// aforementioned algorithms. We then proceed to gather all such
+ /// reduced values at the first warp.
+ ///
+ /// The runtime makes use of the function 'InterWarpCpyFn', which copies
+ /// data from each of the "warp master" (zeroth lane of each warp, where
+ /// warp-reduced data is held) to the zeroth warp. This step reduces (in
+ /// a mathematical sense) the problem of reduction across warp masters in
+ /// a block to the problem of warp reduction.
+ ///
+ ///
+ /// Inter-Team Reduction
+ ///
+ /// Once a team has reduced its data to a single value, it is stored in
+ /// a global scratchpad array. Since each team has a distinct slot, this
+ /// can be done without locking.
+ ///
+ /// The last team to write to the scratchpad array proceeds to reduce the
+ /// scratchpad array. One or more workers in the last team use the helper
+ /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
+ /// the k'th worker reduces every k'th element.
+ ///
+ /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
+ /// reduce across workers and compute a globally reduced value.
+ ///
/// \param Loc The location where the reduction was
/// encountered. Must be within the associate
/// directive and after the last local access to the
/// reduction variables.
/// \param AllocaIP An insertion point suitable for allocas usable
/// in reductions.
+ /// \param CodeGenIP An insertion point suitable for code generation.
/// \param ReductionInfos A list of info on each reduction variable.
- /// \param IsNoWait A flag set if the reduction is marked as nowait.
- InsertPointTy createReductionsGPU(const LocationDescription &Loc,
- InsertPointTy AllocaIP,
- ArrayRef<ReductionInfo> ReductionInfos,
- bool IsNoWait = false,
- bool IsTeamsReduction = false,
- bool HasDistribute = false);
+ /// \param IsNoWait Optional flag set if the reduction is marked as
+ /// nowait.
+ /// \param IsTeamsReduction Optional flag set if it is a teams
+ /// reduction.
+ /// \param HasDistribute Optional flag set if it is a
+ /// distribute reduction.
+ /// \param ReductionGenCBTy Optional param to specify the reduction generation
+ /// callback type (Clang or MLIR).
+ /// \param GridValue Optional GPU grid value.
+ /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be
+ /// used for teams reduction.
+ InsertPointTy createReductionsGPU(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
+ bool IsNoWait = false, bool IsTeamsReduction = false,
+ bool HasDistribute = false,
+ ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
+ std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
+ Value *SrcLocInfo = nullptr);
// TODO: provide atomic and non-atomic reduction generators for reduction
// operators defined by the OpenMP specification.
@@ -1437,9 +1966,6 @@ class OpenMPIRBuilder {
bool IsNoWait = false, bool IsByRef = false,
bool IsTeamsReduction = false,
bool HasDistribute = false);
-
- ///}
-
/// Return the insertion point used by the underlying IRBuilder.
InsertPointTy getInsertionPoint() { return Builder.saveIP(); }
@@ -1513,19 +2039,6 @@ class OpenMPIRBuilder {
Value *NumThreads, Value *HostPtr,
ArrayRef<Value *> KernelArgs);
- /// Generate a barrier runtime call.
- ///
- /// \param Loc The location at which the request originated and is fulfilled.
- /// \param DK The directive which caused the barrier
- /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
- /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
- /// should be checked and acted upon.
- ///
- /// \returns The insertion point after the barrier.
- InsertPointTy emitBarrierImpl(const LocationDescription &Loc,
- omp::Directive DK, bool ForceSimpleCall,
- bool CheckCancelFlag);
-
/// Generate a flush runtime call.
///
/// \param Loc The location at which the request originated and is fulfilled.
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2b48df24e7e734..435af68e344a8a 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -147,15 +147,6 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
Function *GLOBAL_ReductionFunc = nullptr;
-static uint64_t getTypeSizeInBytes(Module &M, Type *Type) {
- return divideCeil(M.getDataLayout().getTypeSizeInBits(Type), 8);
-}
-
-static Value *getTypeSizeInBytesValue(IRBuilder<> &Builder, Module &M,
- Type *Type) {
- return Builder.getInt64(getTypeSizeInBytes(M, Type));
-}
-
static const omp::GV &getGridValue(const Triple &T, StringRef Features) {
if (T.isAMDGPU()) {
if (Features.count("+wavefrontsize64"))
@@ -812,6 +803,12 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
if (!OffloadInfoManager.empty())
createOffloadEntriesAndInfoMetadata(ErrorReportFn);
+
+ if (Config.EmitLLVMUsed) {
+ std::vector<WeakTrackingVH> LLVMCompilerUsed = {
+ M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
+ emitUsed("llvm.compiler.used", LLVMCompilerUsed);
+ }
}
OpenMPIRBuilder::~OpenMPIRBuilder() {
@@ -944,16 +941,12 @@ Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
}
OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK,
- bool ForceSimpleCall, bool CheckCancelFlag) {
+OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
+ bool ForceSimpleCall, bool CheckCancelFlag,
+ Value *ThreadID) {
if (!updateToLocation(Loc))
return Loc.IP;
- return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag);
-}
-OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
- bool ForceSimpleCall, bool CheckCancelFlag) {
// Build call __kmpc_cancel_barrier(loc, thread_id) or
// __kmpc_barrier(loc, thread_id);
@@ -978,9 +971,11 @@ OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
uint32_t SrcLocStrSize;
Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
- Value *Args[] = {
- getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
- getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
+ if (!ThreadID)
+ ThreadID = getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
+
+ Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
+ ThreadID};
// If we are in a cancellable parallel region, barriers are cancellation
// points.
@@ -2116,219 +2111,273 @@ OpenMPIRBuilder::createSection(const LocationDescription &Loc,
/*IsCancellable*/ true);
}
-static Value *getGPUWarpSize(Module &M, OpenMPIRBuilder &OMPBuilder) {
- return OMPBuilder.Builder.CreateCall(
- OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size),
- {});
+static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
+ BasicBlock::iterator IT(I);
+ IT++;
+ return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
}
-static Value *getGPUThreadID(Module &M, OpenMPIRBuilder &OMPBuilder) {
- return OMPBuilder.Builder.CreateCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_get_hardware_thread_id_in_block),
- {});
+void OpenMPIRBuilder::emitUsed(StringRef Name,
+ std::vector<WeakTrackingVH> &List) {
+ if (List.empty())
+ return;
+
+ // Convert List to what ConstantArray needs.
+ SmallVector<Constant *, 8> UsedArray;
+ UsedArray.resize(List.size());
+ for (unsigned I = 0, E = List.size(); I != E; ++I)
+ UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ cast<Constant>(&*List[I]), Builder.getPtrTy());
+
+ if (UsedArray.empty())
+ return;
+ ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
+
+ auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+ ConstantArray::get(ATy, UsedArray), Name);
+
+ GV->setSection("llvm.metadata");
}
-static Value *getGPUNumThreads(Module &M, OpenMPIRBuilder &OMPBuilder) {
- const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
- llvm::Function *F = M.getFunction(LocSize);
- if (!F) {
- LLVMContext &Ctx = M.getContext();
- Type *I32Type = Type::getInt32Ty(Ctx);
+Value *OpenMPIRBuilder::getGPUThreadID() {
+ return Builder.CreateCall(
+ getOrCreateRuntimeFunction(M,
+ OMPRTL___kmpc_get_hardware_thread_id_in_block),
+ {});
+}
- F = Function::Create(FunctionType::get(I32Type, std::nullopt, false),
- GlobalVariable::ExternalLinkage, LocSize, M);
- }
- return OMPBuilder.Builder.CreateCall(F, std::nullopt, "nvptx_num_threads");
+Value *OpenMPIRBuilder::getGPUWarpSize() {
+ return Builder.CreateCall(
+ getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
}
-static Value *getNVPTXWarpID(Module &M, OpenMPIRBuilder &OMPIRBuilder) {
- unsigned LaneIDBits =
- llvm::Log2_32(OMPIRBuilder.Config.getGridValue().GV_Warp_Size);
- return OMPIRBuilder.Builder.CreateAShr(getGPUThreadID(M, OMPIRBuilder),
- LaneIDBits, "nvptx_warp_id");
+Value *OpenMPIRBuilder::getNVPTXWarpID() {
+ unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
+ return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
}
-static Value *getNVPTXLaneID(Module &M, OpenMPIRBuilder &OMPIRBuilder) {
- unsigned LaneIDBits =
- llvm::Log2_32(OMPIRBuilder.Config.getGridValue().GV_Warp_Size);
+Value *OpenMPIRBuilder::getNVPTXLaneID() {
+ unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
- return OMPIRBuilder.Builder.CreateAnd(
- getGPUThreadID(M, OMPIRBuilder),
- OMPIRBuilder.Builder.getInt32(LaneIDMask), "nvptx_lane_id");
+ return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
+ "nvptx_lane_id");
}
-namespace {
-enum CopyAction : unsigned {
- // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
- // the warp using shuffle instructions.
- RemoteLaneToThread,
- // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
- ThreadCopy,
-};
-} // namespace
-
-struct CopyOptionsTy {
- llvm::Value *RemoteLaneOffset;
- llvm::Value *ScratchpadIndex;
- llvm::Value *ScratchpadWidth;
-};
-
-static Value *castValueToType(Module &M, OpenMPIRBuilder &OMPBuilder,
- Value *From, Type *ToType,
- OpenMPIRBuilder::InsertPointTy AllocaIP,
- const OpenMPIRBuilder::LocationDescription &Loc) {
- IRBuilder<> &Builder = OMPBuilder.Builder;
+Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
+ Type *ToType) {
Type *FromType = From->getType();
- uint64_t FromSize =
- divideCeil(M.getDataLayout().getTypeSizeInBits(FromType), 8);
- uint64_t ToSize = divideCeil(M.getDataLayout().getTypeSizeInBits(ToType), 8);
+ uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
+ uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
assert(FromSize > 0 && "From size must be greater than zero");
- assert(ToSize > 0 && "From size must be greater than zero");
+ assert(ToSize > 0 && "To size must be greater than zero");
if (FromType == ToType)
return From;
if (FromSize == ToSize)
return Builder.CreateBitCast(From, ToType);
if (ToType->isIntegerTy() && FromType->isIntegerTy())
- // FIXME(JAN): Assuming signed integer here, not sure how to find out
- // if unsigned
return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
- OpenMPIRBuilder::InsertPointTy CurIP = Builder.saveIP();
+ InsertPointTy SaveIP = Builder.saveIP();
Builder.restoreIP(AllocaIP);
- Value *CastItem = Builder.CreateAlloca(ToType, nullptr, "cast_tmp");
- Builder.restoreIP(CurIP);
+ Value *CastItem = Builder.CreateAlloca(ToType);
+ Builder.restoreIP(SaveIP);
Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
- CastItem, FromType->getPointerTo(), "valcastitem");
+ CastItem, FromType->getPointerTo());
Builder.CreateStore(From, ValCastItem);
- return Builder.CreateLoad(ToType, CastItem, "castitemload");
+ return Builder.CreateLoad(ToType, CastItem);
}
-static Value *
-createRuntimeShuffleFunction(Module &M, OpenMPIRBuilder &OMPBuilder,
- const OpenMPIRBuilder::LocationDescription &Loc,
- OpenMPIRBuilder::InsertPointTy AllocaIP,
- Value *Element, Type *ElementType, Value *Offset) {
- LLVMContext &Ctx = M.getContext();
- IRBuilder<> &Builder = OMPBuilder.Builder;
- uint64_t Size =
- divideCeil(M.getDataLayout().getTypeSizeInBits(ElementType), 8);
+Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
+ Value *Element,
+ Type *ElementType,
+ Value *Offset) {
+ uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
- Function *ShuffleFunc = OMPBuilder.getOrCreateRuntimeFunctionPtr(
+
+ // Cast all types to 32- or 64-bit values before calling shuffle routines.
+ Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
+ Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
+ Value *WarpSize =
+ Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
+ Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
: RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
- Type *IntType = Builder.getIntNTy(Size <= 4 ? 32 : 64);
- Value *ElemCast = Builder.CreateCast(Instruction::SExt, Element, IntType);
- Value *WarpSize = getGPUWarpSize(M, OMPBuilder);
Value *WarpSizeCast =
- Builder.CreateIntCast(WarpSize, Type::getInt16Ty(Ctx), /*isSigned=*/true);
+ Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
Value *ShuffleCall =
Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
- return castValueToType(M, OMPBuilder, ShuffleCall, IntType, AllocaIP, Loc);
-}
-
-static void shuffleAndStore(Value *SrcAddr, Value *DstAddr, Type *ElementType,
- llvm::Value *Offset, Type *ReductionArrayTy,
- const OpenMPIRBuilder::LocationDescription &Loc,
- Module &M, OpenMPIRBuilder &OMPBuilder,
- OpenMPIRBuilder::InsertPointTy AllocaIP) {
- LLVMContext &Ctx = M.getContext();
- IRBuilder<> &Builder = OMPBuilder.Builder;
- uint64_t Size =
- divideCeil(M.getDataLayout().getTypeSizeInBits(ElementType), 8);
- Type *PtrTy = PointerType::getUnqual(Ctx);
+ return castValueToType(AllocaIP, ShuffleCall, CastTy);
+}
+
+void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
+ Value *DstAddr, Type *ElemType,
+ Value *Offset, Type *ReductionArrayTy) {
+ uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
+ // Create the loop over the big sized data.
+ // ptr = (void*)Elem;
+ // ptrEnd = (void*) Elem + 1;
+ // Step = 8;
+ // while (ptr + Step < ptrEnd)
+ // shuffle((int64_t)*ptr);
+ // Step = 4;
+ // while (ptr + Step < ptrEnd)
+ // shuffle((int32_t)*ptr);
+ // ...
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
Value *ElemPtr = DstAddr;
Value *Ptr = SrcAddr;
- // Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
- // Builder.CreateConstGEP1_64(ReductionArrayTy, SrcAddr, 1), PtrTy);
- for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
+ for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
if (Size < IntSize)
continue;
- // FIXME(JAN): Check if there is a function to convert from bytes to bits
- Type *IntTy = Builder.getIntNTy(IntSize * 8);
+ Type *IntType = Builder.getIntNTy(IntSize * 8);
Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
- Ptr, IntTy->getPointerTo(), "ptrcast");
+ Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast");
+ Value *SrcAddrGEP =
+ Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtr, IntTy->getPointerTo(), "elemptrcast");
-
- // FIXME(JAN): Implement loop to handle larger size
- assert(((Size / IntSize) <= 1) && "Unsupported IntSize");
- Value *Val = Builder.CreateLoad(IntTy, Ptr);
- Value *Res = createRuntimeShuffleFunction(M, OMPBuilder, Loc, AllocaIP, Val,
- IntTy, Offset);
- Builder.CreateStore(Res, ElemPtr);
- Ptr = Builder.CreateConstGEP1_64(ReductionArrayTy, Ptr, 1, "ptrgep");
- ElemPtr =
- Builder.CreateConstGEP1_64(ReductionArrayTy, ElemPtr, 1, "elemptrgep");
+ ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast");
+
+ Function *CurFunc = Builder.GetInsertBlock()->getParent();
+ if ((Size / IntSize) > 1) {
+ Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ SrcAddrGEP, Builder.getPtrTy());
+ BasicBlock *PreCondBB =
+ BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
+ BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
+ BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
+ BasicBlock *CurrentBB = Builder.GetInsertBlock();
+ emitBlock(PreCondBB, CurFunc);
+ PHINode *PhiSrc =
+ Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
+ PhiSrc->addIncoming(Ptr, CurrentBB);
+ PHINode *PhiDest =
+ Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
+ PhiDest->addIncoming(ElemPtr, CurrentBB);
+ Ptr = PhiSrc;
+ ElemPtr = PhiDest;
+ Value *PtrDiff = Builder.CreatePtrDiff(
+ Builder.getInt8Ty(), PtrEnd,
+ Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
+ Builder.CreateCondBr(
+ Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
+ ExitBB);
+ emitBlock(ThenBB, CurFunc);
+ Value *Res = createRuntimeShuffleFunction(
+ AllocaIP,
+ Builder.CreateAlignedLoad(
+ IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
+ IntType, Offset);
+ Builder.CreateAlignedStore(Res, ElemPtr,
+ M.getDataLayout().getPrefTypeAlign(ElemType));
+ Value *LocalPtr =
+ Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
+ Value *LocalElemPtr =
+ Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
+ PhiSrc->addIncoming(LocalPtr, ThenBB);
+ PhiDest->addIncoming(LocalElemPtr, ThenBB);
+ emitBranch(PreCondBB);
+ emitBlock(ExitBB, CurFunc);
+ } else {
+ Value *Res = createRuntimeShuffleFunction(
+ AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
+ if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
+ Res->getType()->getScalarSizeInBits())
+ Res = Builder.CreateTrunc(Res, ElemType);
+ Builder.CreateStore(Res, ElemPtr);
+ Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
+ ElemPtr =
+ Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
+ }
Size = Size % IntSize;
}
}
-static void
-emitReductionListCopy(CopyAction Action, Type *ReductionArrayTy,
- ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
- Value *SrcBase, Value *DestBase, Module &M,
- OpenMPIRBuilder &OMPBuilder,
- const OpenMPIRBuilder::LocationDescription &Loc,
- OpenMPIRBuilder::InsertPointTy AllocaIP,
- CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
- LLVMContext &Ctx = M.getContext();
- IRBuilder<> &Builder = OMPBuilder.Builder;
- Type *PtrTy = PointerType::getUnqual(Ctx);
-
+void OpenMPIRBuilder::emitReductionListCopy(
+ InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
+ ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
+ CopyOptionsTy CopyOptions) {
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
+ // Iterates, element-by-element, through the source Reduce list and
+ // makes a copy.
for (auto En : enumerate(ReductionInfos)) {
- const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+ const ReductionInfo &RI = En.value();
Value *SrcElementAddr = nullptr;
Value *DestElementAddr = nullptr;
Value *DestElementPtrAddr = nullptr;
+ // Should we shuffle in an element from a remote lane?
bool ShuffleInElement = false;
+ // Set to true to update the pointer in the dest Reduce list to a
+ // newly created element.
bool UpdateDestListPtr = false;
// Step 1.1: Get the address for the src element in the Reduce list.
- Value *SrcElementPtrAddr = Builder.CreateConstGEP2_64(
- ReductionArrayTy, SrcBase, 0, En.index(), "srcelementptraddr");
- SrcElementAddr =
- Builder.CreateLoad(PtrTy, SrcElementPtrAddr, "srcelementaddr");
+ Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
+ ReductionArrayTy, SrcBase,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
// Step 1.2: Create a temporary to store the element in the destination
// Reduce list.
DestElementPtrAddr = Builder.CreateInBoundsGEP(
ReductionArrayTy, DestBase,
- {Builder.getInt64(0), Builder.getInt64(En.index())},
- "destelementptraddr");
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
switch (Action) {
- case RemoteLaneToThread: {
- OpenMPIRBuilder::InsertPointTy CurIP = Builder.saveIP();
+ case CopyAction::RemoteLaneToThread: {
+ InsertPointTy CurIP = Builder.saveIP();
Builder.restoreIP(AllocaIP);
- DestElementAddr = Builder.CreateAlloca(RI.ElementType, nullptr,
- ".omp.reduction.element");
+ AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
+ ".omp.reduction.element");
+ DestAlloca->setAlignment(
+ M.getDataLayout().getPrefTypeAlign(RI.ElementType));
+ DestElementAddr = DestAlloca;
+ DestElementAddr =
+ Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
+ DestElementAddr->getName() + ".ascast");
Builder.restoreIP(CurIP);
ShuffleInElement = true;
UpdateDestListPtr = true;
break;
}
- case ThreadCopy: {
+ case CopyAction::ThreadCopy: {
DestElementAddr =
- Builder.CreateLoad(PtrTy, DestElementPtrAddr, "destelementaddr");
+ Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
break;
}
}
- // FIXME(JAN): Original code in clanguses <Addr>.withElementType(...)
- // check if this generates any code
-
+ // Now that all active lanes have read the element in the
+ // Reduce list, shuffle over the value from the remote lane.
if (ShuffleInElement) {
- shuffleAndStore(SrcElementAddr, DestElementAddr, RI.ElementType,
- RemoteLaneOffset, ReductionArrayTy, Loc, M, OMPBuilder,
- AllocaIP);
+ shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
+ RemoteLaneOffset, ReductionArrayTy);
} else {
- // FIXME(JAN): Assume Scalar here (TEK_Scalar in Clang)
- Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
- Builder.CreateStore(Elem, DestElementAddr);
+ switch (RI.EvaluationKind) {
+ case EvaluationKindTy::Scalar: {
+ Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
+ // Store the source element value to the dest element address.
+ Builder.CreateStore(Elem, DestElementAddr);
+ break;
+ }
+ case EvaluationKindTy::Complex: {
+ break;
+ }
+ case EvaluationKindTy::Aggregate: {
+ Value *SizeVal = Builder.getInt64(
+ M.getDataLayout().getTypeStoreSize(RI.ElementType));
+ Builder.CreateMemCpy(
+ DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+ SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+ SizeVal, false);
+ break;
+ }
+ };
}
+
// Step 3.1: Modify reference in dest Reduce list as needed.
// Modifying the reference in Reduce list to point to the newly
// created element. The element is live in the current function
@@ -2336,443 +2385,500 @@ emitReductionListCopy(CopyAction Action, Type *ReductionArrayTy,
// RemoteReduceData[i] = (void*)&RemoteElem
if (UpdateDestListPtr) {
Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
- DestElementAddr, PtrTy, "castdestaddr");
+ DestElementAddr, Builder.getPtrTy(),
+ DestElementAddr->getName() + ".ascast");
Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
}
}
}
-static OpenMPIRBuilder::InsertPointTy getIPAfterInstr(Instruction *I) {
- BasicBlock::iterator it(I);
- it++;
- return OpenMPIRBuilder::InsertPointTy(I->getParent(), it);
-}
-
-static Function *emitShuffleAndReduceFunction(
- Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
- ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
- OpenMPIRBuilder &OMPBuilder) {
- IRBuilder<> &Builder = OMPBuilder.Builder;
-
- LLVMContext &Ctx = M.getContext();
- Type *VoidTy = Type::getVoidTy(Ctx);
- Type *PtrTy = PointerType::getUnqual(Ctx);
- Type *I16Type = Type::getInt16Ty(Ctx);
- auto FuncTy = FunctionType::get(VoidTy, {PtrTy, I16Type, I16Type, I16Type},
- /* IsVarArg */ false);
- Function *SarFunc =
- Function::Create(FuncTy, GlobalVariable::InternalLinkage,
- "_omp_reduction_shuffle_and_reduce_func", &M);
- SarFunc->setDoesNotRecurse();
-
- // Set arg names
- Argument *Arg0 = SarFunc->getArg(0);
- Argument *Arg1 = SarFunc->getArg(1);
- Argument *Arg2 = SarFunc->getArg(2);
- Argument *Arg3 = SarFunc->getArg(3);
- Arg0->setName("reduce_list_arg");
- Arg1->setName("lane_id_arg");
- Arg2->setName("remote_lane_offset_arg");
- Arg3->setName("algo_ver_arg");
-
- BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", SarFunc);
- Builder.SetInsertPoint(EntryBlock);
-
- Type *Arg0Type = Arg0->getType();
- Type *ArgNType = Arg1->getType();
- Type *ArgNPtrType = Arg1->getType()->getPointerTo();
- Value *ReduceListAlloca =
- Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
- Value *LaneIdAlloca =
- Builder.CreateAlloca(ArgNType, nullptr, Arg1->getName() + ".addr");
- Value *RemoteLaneOffsetAlloca =
- Builder.CreateAlloca(ArgNType, nullptr, Arg2->getName() + ".addr");
- Value *AlgoVerAlloca =
- Builder.CreateAlloca(ArgNType, nullptr, Arg3->getName() + ".addr");
- // FIXME(Jan): Compute reduction list array type
- auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
- Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
- RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
-
- Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".acast");
- Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- LaneIdAlloca, ArgNPtrType, LaneIdAlloca->getName() + ".acast");
- Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- RemoteLaneOffsetAlloca, ArgNPtrType,
- RemoteLaneOffsetAlloca->getName() + ".acast");
- Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- AlgoVerAlloca, ArgNPtrType, AlgoVerAlloca->getName() + ".acast");
- Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- RemoteReductionListAlloca, PtrTy,
- RemoteReductionListAlloca->getName() + ".acast");
-
- Builder.CreateStore(Arg0, ReduceListAddrCast);
- Builder.CreateStore(Arg1, LaneIdAddrCast);
- Builder.CreateStore(Arg2, RemoteLaneOffsetAddrCast);
- Builder.CreateStore(Arg3, AlgoVerAddrCast);
-
- Value *ReduceList =
- Builder.CreateLoad(Arg0Type, ReduceListAddrCast, "reduce_list");
- Value *LaneId = Builder.CreateLoad(ArgNType, LaneIdAddrCast, "lane_id");
- Value *RemoteLaneOffset = Builder.CreateLoad(
- ArgNType, RemoteLaneOffsetAddrCast, "remote_lane_offset");
- Value *AlgoVer = Builder.CreateLoad(ArgNType, AlgoVerAddrCast, "algo_ver");
-
- OpenMPIRBuilder::InsertPointTy AllocaIP =
- getIPAfterInstr(RemoteReductionListAlloca);
- emitReductionListCopy(RemoteLaneToThread, RedListArrayTy, ReductionInfos,
- ReduceList, RemoteListAddrCast, M, OMPBuilder, Loc,
- AllocaIP, {RemoteLaneOffset, nullptr, nullptr});
-
- // The actions to be performed on the Remote Reduce list is dependent
- // on the algorithm version.
- //
- // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
- // LaneId % 2 == 0 && Offset > 0):
- // do the reduction value aggregation
- //
- // The thread local variable Reduce list is mutated in place to host the
- // reduced data, which is the aggregated value produced from local and
- // remote lanes.
- //
- // Note that AlgoVer is expected to be a constant integer known at compile
- // time.
- // When AlgoVer==0, the first conjunction evaluates to true, making
- // the entire predicate true during compile time.
- // When AlgoVer==1, the second conjunction has only the second part to be
- // evaluated during runtime. Other conjunctions evaluates to false
- // during compile time.
- // When AlgoVer==2, the third conjunction has only the second part to be
- // evaluated during runtime. Other conjunctions evaluates to false
- // during compile time.
- Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
- Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
- Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
- Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
- Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
- Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
- Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
- Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
- Value *RemoteOffsetComp =
- Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
- Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
- Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
- Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
-
- BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", SarFunc);
- BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", SarFunc);
- BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont", SarFunc);
-
- Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
- Builder.SetInsertPoint(ThenBB);
- // reduce_function(LocalReduceList, RemoteReduceList)
- Value *LocalReduceListPtr =
- Builder.CreatePointerBitCastOrAddrSpaceCast(ReduceList, PtrTy);
- Value *RemoteReduceListPtr =
- Builder.CreatePointerBitCastOrAddrSpaceCast(RemoteListAddrCast, PtrTy);
- Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
- Builder.CreateBr(MergeBB);
- Builder.SetInsertPoint(ElseBB);
- Builder.CreateBr(MergeBB);
- Builder.SetInsertPoint(MergeBB);
-
- Value *Algo1_2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
- Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
- Value *CondCopy = Builder.CreateAnd(Algo1_2, LaneIdGtOffset);
-
- BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "cpy_then", SarFunc);
- BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "cpy_else", SarFunc);
- BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "cpy_ifcont", SarFunc);
-
- Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
-
- Builder.SetInsertPoint(CpyThenBB);
- emitReductionListCopy(ThreadCopy, RedListArrayTy, ReductionInfos,
- RemoteListAddrCast, ReduceList, M, OMPBuilder, Loc,
- AllocaIP);
- Builder.CreateBr(CpyMergeBB);
- Builder.SetInsertPoint(CpyElseBB);
- Builder.CreateBr(CpyMergeBB);
- Builder.SetInsertPoint(CpyMergeBB);
- Builder.CreateRetVoid();
-
- return SarFunc;
-}
-
-static Function *emitInterWarpCopyFunction(
- Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
- ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
- OpenMPIRBuilder &OMPBuilder) {
- IRBuilder<> &Builder = OMPBuilder.Builder;
- OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+Function *OpenMPIRBuilder::emitInterWarpCopyFunction(
+ const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
+ AttributeList FuncAttrs) {
+ InsertPointTy SavedIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
- Type *VoidTy = Type::getVoidTy(Ctx);
- Type *PtrTy = PointerType::getUnqual(Ctx);
- Type *I32Type = Type::getInt32Ty(Ctx);
- auto FuncTy =
- FunctionType::get(VoidTy, {PtrTy, I32Type}, /* IsVarArg */ false);
+ FunctionType *FuncTy = FunctionType::get(
+ Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
+ /* IsVarArg */ false);
Function *WcFunc =
Function::Create(FuncTy, GlobalVariable::InternalLinkage,
"_omp_reduction_inter_warp_copy_func", &M);
- WcFunc->setDoesNotRecurse();
-
- // Set arg names
- Argument *Arg0 = WcFunc->getArg(0);
- Argument *Arg1 = WcFunc->getArg(1);
- Arg0->setName("reduce_list");
- Arg1->setName("num_warps");
-
- // Ensure data transfer storage
- unsigned WarpSize = OMPBuilder.Config.getGridValue().GV_Warp_Size;
- // FIXME(Jan): Not sure about the array type here, but it is I32 in Clang
- auto *ArrayTy = ArrayType::get(I32Type, WarpSize);
+ WcFunc->setAttributes(FuncAttrs);
+ WcFunc->addParamAttr(0, Attribute::NoUndef);
+ WcFunc->addParamAttr(1, Attribute::NoUndef);
+ BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
+ Builder.SetInsertPoint(EntryBB);
+
+ // ReduceList: thread local Reduce list.
+ // At the stage of the computation when this function is called, partially
+ // aggregated values reside in the first lane of every active warp.
+ Argument *ReduceListArg = WcFunc->getArg(0);
+ // NumWarps: number of warps active in the parallel region. This could
+ // be smaller than 32 (max warps in a CTA) for partial block reduction.
+ Argument *NumWarpsArg = WcFunc->getArg(1);
+
+ // This array is used as a medium to transfer, one reduce element at a time,
+ // the data from the first lane of every warp to lanes in the first warp
+ // in order to perform the final step of a reduction in a parallel region
+ // (reduction across warps). The array is placed in NVPTX __shared__ memory
+ // for reduced latency, as well as to have a distinct copy for concurrently
+ // executing target regions. The array is declared with weak linkage so
+ // as to be shared across compilation units.
StringRef TransferMediumName =
"__openmp_nvptx_data_transfer_temporary_storage";
GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
+ unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
+ ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
if (!TransferMedium) {
- unsigned SharedAddressSpace =
- 3; /* FIXME(Jan): C.getTargetAddressSpace(LangAS::cuda_shared); */
TransferMedium = new GlobalVariable(
M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
UndefValue::get(ArrayTy), TransferMediumName,
/*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
- SharedAddressSpace);
+ /*AddressSpace=*/3);
}
- BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", WcFunc);
- Builder.SetInsertPoint(EntryBlock);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
- Type *Arg0Type = Arg0->getType();
- Type *Arg1Type = Arg1->getType();
- Value *ReduceListAlloca =
- Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
- Instruction *NumWarpsAlloca =
- Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+ // Get the CUDA thread id of the current OpenMP thread on the GPU.
+ Value *GPUThreadID = getGPUThreadID();
+ // nvptx_lane_id = nvptx_id % warpsize
+ Value *LaneID = getNVPTXLaneID();
+ // nvptx_warp_id = nvptx_id / warpsize
+ Value *WarpID = getNVPTXWarpID();
+
+ InsertPointTy AllocaIP =
+ InsertPointTy(Builder.GetInsertBlock(),
+ Builder.GetInsertBlock()->getFirstInsertionPt());
+ Type *Arg0Type = ReduceListArg->getType();
+ Type *Arg1Type = NumWarpsArg->getType();
+ Builder.restoreIP(AllocaIP);
+ AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
+ Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
+ AllocaInst *NumWarpsAlloca =
+ Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
+ Value *ThreadID =
+ getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".acast");
+ ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
NumWarpsAlloca, Arg1Type->getPointerTo(),
- NumWarpsAlloca->getName() + ".acast");
- Builder.CreateStore(Arg0, ReduceListAddrCast);
- Builder.CreateStore(Arg1, NumWarpsAddrCast);
-
- // Get GPU Info
- Value *ThreadID = getGPUThreadID(M, OMPBuilder);
- Value *LaneID = getNVPTXLaneID(M, OMPBuilder);
- Value *WarpID = getNVPTXWarpID(M, OMPBuilder);
+ NumWarpsAlloca->getName() + ".ascast");
+ Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
+ Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
+ AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
+ InsertPointTy CodeGenIP =
+ getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
+ Builder.restoreIP(CodeGenIP);
- Value *ReduceListArg =
- Builder.CreateLoad(PtrTy, ReduceListAddrCast, "reduce_list_arg");
+ Value *ReduceList =
+ Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
for (auto En : enumerate(ReductionInfos)) {
- const OpenMPIRBuilder::ReductionInfo &RI = En.value();
- Type *ElementTy = RI.ElementType;
- unsigned NumTypeBits = M.getDataLayout().getTypeSizeInBits(ElementTy);
- unsigned RealTySize = divideCeil(NumTypeBits, 8);
+ //
+ // Warp master copies reduce element to transfer medium in __shared__
+ // memory.
+ //
+ const ReductionInfo &RI = En.value();
+ unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
+ Type *CType = Builder.getIntNTy(TySize * 8);
+
unsigned NumIters = RealTySize / TySize;
if (NumIters == 0)
continue;
- // Type *CopyTy = Builder.getIntNTy(TySize);
- Type *Int32Ty = Builder.getInt32Ty();
Value *Cnt = nullptr;
- Value *CntAddrAcast = nullptr;
+ Value *CntAddr = nullptr;
BasicBlock *PrecondBB = nullptr;
BasicBlock *ExitBB = nullptr;
-
if (NumIters > 1) {
- OpenMPIRBuilder::InsertPointTy CurrIP = Builder.saveIP();
- Builder.SetInsertPoint(NumWarpsAlloca);
- Value *CntAddr = Builder.CreateAlloca(Int32Ty, nullptr, ".cnt.addr");
- Builder.restoreIP(CurrIP);
- CntAddrAcast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- CntAddr, PtrTy, CntAddr->getName() + ".acast");
- Builder.CreateStore(Constant::getNullValue(Int32Ty), CntAddrAcast);
- PrecondBB = BasicBlock::Create(Ctx, "precond", WcFunc);
- ExitBB = BasicBlock::Create(Ctx, "exit", WcFunc);
- BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body", WcFunc);
- Builder.CreateBr(PrecondBB);
- Builder.SetInsertPoint(PrecondBB);
- Cnt = Builder.CreateLoad(Int32Ty, CntAddrAcast, "cnt");
- Value *Cmp = Builder.CreateICmpULT(Cnt, Builder.getInt32(NumIters));
+ CodeGenIP = Builder.saveIP();
+ Builder.restoreIP(AllocaIP);
+ CntAddr =
+ Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
+
+ CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
+ CntAddr->getName() + ".ascast");
+ Builder.restoreIP(CodeGenIP);
+ Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
+ CntAddr,
+ /*Volatile=*/false);
+ PrecondBB = BasicBlock::Create(Ctx, "precond");
+ ExitBB = BasicBlock::Create(Ctx, "exit");
+ BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
+ emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
+ Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
+ /*Volatile=*/false);
+ Value *Cmp = Builder.CreateICmpULT(
+ Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
- Builder.SetInsertPoint(BodyBB);
+ emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
}
- OMPBuilder.createBarrier(
- OpenMPIRBuilder::LocationDescription(Builder.saveIP(), Loc.DL),
- omp::Directive::OMPD_unknown,
- /* ForceSimpleCall */ false,
- /* CheckCancelFlag */ true);
- BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", WcFunc);
- BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", WcFunc);
- BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont", WcFunc);
+ // kmpc_barrier.
+ createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+ omp::Directive::OMPD_unknown,
+ /* ForceSimpleCall */ false,
+ /* CheckCancelFlag */ true, ThreadID);
+ BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
+ BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
+ BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
// if (lane_id == 0)
Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
+ emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
- // then
// Reduce element = LocalReduceList[i]
- Builder.SetInsertPoint(ThenBB);
- // FIXME(JAN): Should array type be passed in?
- auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
- // FIXME(JAN): maybe it should be 0,0 and not use En.index()
- Value *ReduceListElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedListArrayTy, ReduceListArg, 0, En.index());
- Value *ReduceListElementPtr = Builder.CreateLoad(
- PtrTy, ReduceListElementPtrPtr, "reduce_list_element_ptr");
+ auto *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ Value *ElemPtrPtr =
+ Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
+ {ConstantInt::get(IndexTy, 0),
+ ConstantInt::get(IndexTy, En.index())});
+ // elemptr = ((CopyType*)(elemptrptr)) + I
+ Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
if (NumIters > 1)
- ReduceListElementPtr =
- Builder.CreateGEP(Int32Ty, ReduceListElementPtr, Cnt);
+ ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
- Value *TransferElemAddr = Builder.CreateInBoundsGEP(
+ // Get pointer to location in transfer medium.
+ // MediumPtr = &medium[warp_id]
+ Value *MediumPtr = Builder.CreateInBoundsGEP(
ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
- Value *ReduceListElement = Builder.CreateLoad(
- I32Type, ReduceListElementPtr, "reduce_list_element");
- Builder.CreateStore(ReduceListElement, TransferElemAddr,
+ // elem = *elemptr
+ // *MediumPtr = elem
+ Value *Elem = Builder.CreateLoad(CType, ElemPtr);
+ // Store the source element value to the dest element address.
+ Builder.CreateStore(Elem, MediumPtr,
/*IsVolatile*/ true);
Builder.CreateBr(MergeBB);
// else
- Builder.SetInsertPoint(ElseBB);
+ emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
Builder.CreateBr(MergeBB);
// endif
- Builder.SetInsertPoint(MergeBB);
- OMPBuilder.createBarrier(
- OpenMPIRBuilder::LocationDescription(Builder.saveIP(), Loc.DL),
- omp::Directive::OMPD_unknown,
- /* ForceSimpleCall */ false,
- /* CheckCancelFlag */ true);
+ emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
+ createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+ omp::Directive::OMPD_unknown,
+ /* ForceSimpleCall */ false,
+ /* CheckCancelFlag */ true, ThreadID);
// Warp 0 copies reduce element from transfer medium
- BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "w0then", WcFunc);
- BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "w0else", WcFunc);
- BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "w0ifcont", WcFunc);
+ BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
+ BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
+ BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
Value *NumWarpsVal =
- Builder.CreateLoad(I32Type, NumWarpsAddrCast, "num_warps");
+ Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
+ // Up to 32 threads in warp 0 are active.
Value *IsActiveThread =
- Builder.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
+ Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
- // W0then
- // SecMEdiumPtr = &medium[tid]
- Builder.SetInsertPoint(W0ThenBB);
- Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
- ArrayTy, TransferMedium, {Builder.getInt64(0), ThreadID});
+ emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
+
+ // SecMediumPtr = &medium[tid]
// SrcMediumVal = *SrcMediumPtr
- // TODO(JAN): Bitcast here, but no load? skipping for now
- Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedListArrayTy, ReduceListArg, 0, En.index());
- Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+ Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
+ ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
+ // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
+ Value *TargetElemPtrPtr =
+ Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
+ {ConstantInt::get(IndexTy, 0),
+ ConstantInt::get(IndexTy, En.index())});
+ Value *TargetElemPtrVal =
+ Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
+ Value *TargetElemPtr = TargetElemPtrVal;
if (NumIters > 1)
- TargetElementPtr = Builder.CreateGEP(Int32Ty, TargetElementPtr, Cnt);
+ TargetElemPtr =
+ Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
+ // *TargetElemPtr = SrcMediumVal;
Value *SrcMediumValue =
- Builder.CreateLoad(I32Type, SrcMediumPtrVal, /*IsVolatile*/ true);
- Builder.CreateStore(SrcMediumValue, TargetElementPtr);
+ Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
+ Builder.CreateStore(SrcMediumValue, TargetElemPtr);
Builder.CreateBr(W0MergeBB);
- // W0else
- Builder.SetInsertPoint(W0ElseBB);
+ emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
Builder.CreateBr(W0MergeBB);
- // W0endif
- Builder.SetInsertPoint(W0MergeBB);
+ emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
+
if (NumIters > 1) {
- Cnt = Builder.CreateNSWAdd(Cnt, Builder.getInt32(1));
- Builder.CreateStore(Cnt, CntAddrAcast);
- Builder.CreateBr(PrecondBB);
- Builder.SetInsertPoint(ExitBB);
+ Cnt = Builder.CreateNSWAdd(
+ Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
+ Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
+
+ auto *CurFn = Builder.GetInsertBlock()->getParent();
+ emitBranch(PrecondBB);
+ emitBlock(ExitBB, CurFn);
}
+ RealTySize %= TySize;
}
}
Builder.CreateRetVoid();
- Builder.restoreIP(OldIP);
+ Builder.restoreIP(SavedIP);
+
return WcFunc;
}
-/// This function emits a helper that copies all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
-/// For all data entries D in reduce_data:
-/// Copy local D to buffer.D[Idx]
-static Function *emitListToGlobalCopyFunction(
- Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
- ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
- OpenMPIRBuilder &OMPBuilder) {
- IRBuilder<> &Builder = OMPBuilder.Builder;
+Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
+ ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+ AttributeList FuncAttrs) {
+ LLVMContext &Ctx = M.getContext();
+ FunctionType *FuncTy =
+ FunctionType::get(Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getInt16Ty(),
+ Builder.getInt16Ty(), Builder.getInt16Ty()},
+ /* IsVarArg */ false);
+ Function *SarFunc =
+ Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+ "_omp_reduction_shuffle_and_reduce_func", &M);
+ SarFunc->setAttributes(FuncAttrs);
+ SarFunc->addParamAttr(0, Attribute::NoUndef);
+ SarFunc->addParamAttr(1, Attribute::NoUndef);
+ SarFunc->addParamAttr(2, Attribute::NoUndef);
+ SarFunc->addParamAttr(3, Attribute::NoUndef);
+ SarFunc->addParamAttr(1, Attribute::SExt);
+ SarFunc->addParamAttr(2, Attribute::SExt);
+ SarFunc->addParamAttr(3, Attribute::SExt);
+ BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
+ Builder.SetInsertPoint(EntryBB);
+
+ // Thread local Reduce list used to host the values of data to be reduced.
+ Argument *ReduceListArg = SarFunc->getArg(0);
+ // Current lane id; could be logical.
+ Argument *LaneIDArg = SarFunc->getArg(1);
+ // Offset of the remote source lane relative to the current lane.
+ Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
+ // Algorithm version. This is expected to be known at compile time.
+ Argument *AlgoVerArg = SarFunc->getArg(3);
+
+ Type *ReduceListArgType = ReduceListArg->getType();
+ Type *LaneIDArgType = LaneIDArg->getType();
+ Type *LaneIDArgPtrType = LaneIDArg->getType()->getPointerTo();
+ Value *ReduceListAlloca = Builder.CreateAlloca(
+ ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
+ Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
+ LaneIDArg->getName() + ".addr");
+ Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
+ LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
+ Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
+ AlgoVerArg->getName() + ".addr");
+ ArrayType *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+ // Create a local thread-private variable to host the Reduce list
+ // from a remote lane.
+ Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
+ RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
+
+ Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReduceListAlloca, ReduceListArgType,
+ ReduceListAlloca->getName() + ".ascast");
+ Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
+ Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RemoteLaneOffsetAlloca, LaneIDArgPtrType,
+ RemoteLaneOffsetAlloca->getName() + ".ascast");
+ Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
+ Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RemoteReductionListAlloca, Builder.getPtrTy(),
+ RemoteReductionListAlloca->getName() + ".ascast");
+
+ Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
+ Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
+ Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
+ Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
+
+ Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
+ Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
+ Value *RemoteLaneOffset =
+ Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
+ Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
+
+ InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
+
+ // This loop iterates through the list of reduce elements and copies,
+ // element by element, from a remote lane in the warp to RemoteReduceList,
+ // hosted on the thread's stack.
+ emitReductionListCopy(
+ AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
+ ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
+
+ // The actions to be performed on the Remote Reduce list is dependent
+ // on the algorithm version.
+ //
+ // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
+ // LaneId % 2 == 0 && Offset > 0):
+ // do the reduction value aggregation
+ //
+ // The thread local variable Reduce list is mutated in place to host the
+ // reduced data, which is the aggregated value produced from local and
+ // remote lanes.
+ //
+ // Note that AlgoVer is expected to be a constant integer known at compile
+ // time.
+ // When AlgoVer==0, the first conjunction evaluates to true, making
+ // the entire predicate true during compile time.
+ // When AlgoVer==1, the second conjunction has only the second part to be
+ // evaluated during runtime. Other conjunctions evaluates to false
+ // during compile time.
+ // When AlgoVer==2, the third conjunction has only the second part to be
+ // evaluated during runtime. Other conjunctions evaluates to false
+ // during compile time.
+ Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
+ Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+ Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
+ Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
+ Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
+ Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
+ Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
+ Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
+ Value *RemoteOffsetComp =
+ Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
+ Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
+ Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
+ Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
+
+ BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
+ BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
+ BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
+
+ Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
+ emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
+ // reduce_function(LocalReduceList, RemoteReduceList)
+ Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReduceList, Builder.getPtrTy());
+ Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RemoteListAddrCast, Builder.getPtrTy());
+ Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
+ ->addFnAttr(Attribute::NoUnwind);
+ Builder.CreateBr(MergeBB);
+
+ emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
+ Builder.CreateBr(MergeBB);
+
+ emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
+
+ // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
+ // Reduce list.
+ Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+ Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
+ Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
+
+ BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
+ BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
+ BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
+ Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
+
+ emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
+ emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
+ ReductionInfos, RemoteListAddrCast, ReduceList);
+ Builder.CreateBr(CpyMergeBB);
+
+ emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
+ Builder.CreateBr(CpyMergeBB);
+
+ emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
+
+ Builder.CreateRetVoid();
+
+ return SarFunc;
+}
+
+Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
+ ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
+ AttributeList FuncAttrs) {
OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
- Type *VoidTy = Type::getVoidTy(Ctx);
- Type *Int32Ty = Builder.getInt32Ty();
- Type *PtrTy = PointerType::getUnqual(Ctx);
- auto FuncTy =
- FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+ FunctionType *FuncTy = FunctionType::get(
+ Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+ /* IsVarArg */ false);
Function *LtGCFunc =
Function::Create(FuncTy, GlobalVariable::InternalLinkage,
"_omp_reduction_list_to_global_copy_func", &M);
- LtGCFunc->setDoesNotRecurse();
+ LtGCFunc->setAttributes(FuncAttrs);
+ LtGCFunc->addParamAttr(0, Attribute::NoUndef);
+ LtGCFunc->addParamAttr(1, Attribute::NoUndef);
+ LtGCFunc->addParamAttr(2, Attribute::NoUndef);
- BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGCFunc);
+ BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
Builder.SetInsertPoint(EntryBlock);
- // Set arg names
- Argument *Arg0 = LtGCFunc->getArg(0);
- Argument *Arg1 = LtGCFunc->getArg(1);
- Argument *Arg2 = LtGCFunc->getArg(2);
- Arg0->setName("buffer_arg");
- Arg1->setName("idx_arg");
- Arg2->setName("reduce_list_arg");
-
- Value *BufferArgAlloca =
- Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
- Value *IdxArgAlloca =
- Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
- Value *ReduceListArgAlloca =
- Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+ // Buffer: global reduction buffer.
+ Argument *BufferArg = LtGCFunc->getArg(0);
+ // Idx: index of the buffer.
+ Argument *IdxArg = LtGCFunc->getArg(1);
+ // ReduceList: thread local Reduce list.
+ Argument *ReduceListArg = LtGCFunc->getArg(2);
+
+ Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+ BufferArg->getName() + ".addr");
+ Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+ IdxArg->getName() + ".addr");
+ Value *ReduceListArgAlloca = Builder.CreateAlloca(
+ Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+ BufferArgAlloca, Builder.getPtrTy(),
+ BufferArgAlloca->getName() + ".ascast");
Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+ IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
- // FIXME(JAN): Assume a single globalized variable for now, this should be
- // passed in
- Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
- Type *TypeArgs[] = {SingleReductionTy};
- StructType *ReductionsBufferTy =
- StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
-
- Builder.CreateStore(Arg0, BufferArgAddrCast);
- Builder.CreateStore(Arg1, IdxArgAddrCast);
- Builder.CreateStore(Arg2, ReduceListArgAddrCast);
-
- Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
- Value *Idxs[] = {
- Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast, "idxs")};
- Value *ReduceListArg =
- Builder.CreateLoad(PtrTy, ReduceListArgAddrCast, "reduce_list");
- // FIXME(Jan): Assume TEK_SCALAR
- for (auto En : enumerate(ReductionInfos)) {
- const OpenMPIRBuilder::ReductionInfo &RI = En.value();
- // FIXME(Jan): Compute array type
- auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
- Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedListArrayTy, ReduceListArg, 0, En.index());
- Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+ ReduceListArgAlloca, Builder.getPtrTy(),
+ ReduceListArgAlloca->getName() + ".ascast");
+
+ Builder.CreateStore(BufferArg, BufferArgAddrCast);
+ Builder.CreateStore(IdxArg, IdxArgAddrCast);
+ Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+ Value *LocalReduceList =
+ Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+ Value *BufferArgVal =
+ Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+ Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ for (auto En : enumerate(ReductionInfos)) {
+ const ReductionInfo &RI = En.value();
+ auto *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+ // Reduce element = LocalReduceList[i]
+ Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
+ RedListArrayTy, LocalReduceList,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ // elemptr = ((CopyType*)(elemptrptr)) + I
+ Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+
+ // Global = Buffer.VD[Idx];
Value *BufferVD =
- Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
- Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
- ReductionsBufferTy, BufferVD, 0, En.index());
- Value *TargetElement = Builder.CreateLoad(RI.ElementType, TargetElementPtr);
- Builder.CreateStore(TargetElement, GlobValPtr);
+ Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
+ Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
+ ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+
+ switch (RI.EvaluationKind) {
+ case EvaluationKindTy::Scalar: {
+ Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
+ Builder.CreateStore(TargetElement, GlobVal);
+ break;
+ }
+ case EvaluationKindTy::Complex: {
+ break;
+ }
+ case EvaluationKindTy::Aggregate:
+ Value *SizeVal =
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
+ Builder.CreateMemCpy(
+ GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
+ M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
+ break;
+ }
}
Builder.CreateRetVoid();
@@ -2780,287 +2886,287 @@ static Function *emitListToGlobalCopyFunction(
return LtGCFunc;
}
-/// This function emits a helper that copies all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
-/// For all data entries D in reduce_data:
-/// Copy local D to buffer.D[Idx]
-static Function *emitGlobalToListCopyFunction(
- Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
- ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
- OpenMPIRBuilder &OMPBuilder) {
- IRBuilder<> &Builder = OMPBuilder.Builder;
+Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
+ ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+ Type *ReductionsBufferTy, AttributeList FuncAttrs) {
OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
- Type *VoidTy = Type::getVoidTy(Ctx);
- Type *Int32Ty = Builder.getInt32Ty();
- Type *PtrTy = PointerType::getUnqual(Ctx);
- auto FuncTy =
- FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
- Function *LtGCFunc =
+ FunctionType *FuncTy = FunctionType::get(
+ Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+ /* IsVarArg */ false);
+ Function *LtGRFunc =
Function::Create(FuncTy, GlobalVariable::InternalLinkage,
- "_omp_reduction_global_to_list_copy_func", &M);
- LtGCFunc->setDoesNotRecurse();
+ "_omp_reduction_list_to_global_reduce_func", &M);
+ LtGRFunc->setAttributes(FuncAttrs);
+ LtGRFunc->addParamAttr(0, Attribute::NoUndef);
+ LtGRFunc->addParamAttr(1, Attribute::NoUndef);
+ LtGRFunc->addParamAttr(2, Attribute::NoUndef);
- BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGCFunc);
+ BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
Builder.SetInsertPoint(EntryBlock);
- // Set arg names
- Argument *Arg0 = LtGCFunc->getArg(0);
- Argument *Arg1 = LtGCFunc->getArg(1);
- Argument *Arg2 = LtGCFunc->getArg(2);
- Arg0->setName("buffer_arg");
- Arg1->setName("idx_arg");
- Arg2->setName("reduce_list_arg");
-
- Value *BufferArgAlloca =
- Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
- Value *IdxArgAlloca =
- Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
- Value *ReduceListArgAlloca =
- Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+ // Buffer: global reduction buffer.
+ Argument *BufferArg = LtGRFunc->getArg(0);
+ // Idx: index of the buffer.
+ Argument *IdxArg = LtGRFunc->getArg(1);
+ // ReduceList: thread local Reduce list.
+ Argument *ReduceListArg = LtGRFunc->getArg(2);
+
+ Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+ BufferArg->getName() + ".addr");
+ Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+ IdxArg->getName() + ".addr");
+ Value *ReduceListArgAlloca = Builder.CreateAlloca(
+ Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+ auto *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+ // 1. Build a list of reduction variables.
+ // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+ Value *LocalReduceList =
+ Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
+
Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+ BufferArgAlloca, Builder.getPtrTy(),
+ BufferArgAlloca->getName() + ".ascast");
Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+ IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
- // FIXME(JAN): Assume a single globalized variable for now, this should be
- // passed in
- Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
- Type *TypeArgs[] = {SingleReductionTy};
- StructType *ReductionsBufferTy =
- StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
-
- Builder.CreateStore(Arg0, BufferArgAddrCast);
- Builder.CreateStore(Arg1, IdxArgAddrCast);
- Builder.CreateStore(Arg2, ReduceListArgAddrCast);
-
- Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
- Value *Idxs[] = {
- Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast, "idxs")};
- Value *ReduceListArg =
- Builder.CreateLoad(PtrTy, ReduceListArgAddrCast, "reduce_list");
- // FIXME(Jan): Assume TEK_SCALAR
- for (auto En : enumerate(ReductionInfos)) {
- const OpenMPIRBuilder::ReductionInfo &RI = En.value();
- // FIXME(Jan): Compute array type
- auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
- Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedListArrayTy, ReduceListArg, 0, En.index());
- Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+ ReduceListArgAlloca, Builder.getPtrTy(),
+ ReduceListArgAlloca->getName() + ".ascast");
+ Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LocalReduceList, Builder.getPtrTy(),
+ LocalReduceList->getName() + ".ascast");
+
+ Builder.CreateStore(BufferArg, BufferArgAddrCast);
+ Builder.CreateStore(IdxArg, IdxArgAddrCast);
+ Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+ Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+ Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ for (auto En : enumerate(ReductionInfos)) {
+ Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
+ RedListArrayTy, LocalReduceListAddrCast,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
Value *BufferVD =
- Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+ Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
+ // Global = Buffer.VD[Idx];
Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
- ReductionsBufferTy, BufferVD, 0, En.index());
- Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
- Builder.CreateStore(TargetElement, TargetElementPtr);
+ ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+ Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
}
+ // Call reduce_function(GlobalReduceList, ReduceList)
+ Value *ReduceList =
+ Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+ Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
+ ->addFnAttr(Attribute::NoUnwind);
Builder.CreateRetVoid();
Builder.restoreIP(OldIP);
- return LtGCFunc;
+ return LtGRFunc;
}
-/// This function emits a helper that reduces all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
-/// void *GlobPtrs[];
-/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
-/// ...
-/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
-/// reduce_function(GlobPtrs, reduce_data);
-/// Create a function with a unique name and a "void (i8*, i8*)" signature in
-/// the given module and return it.
-static Function *emitListToGlobalReduceFunction(
- Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
- ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
- OpenMPIRBuilder &OMPBuilder) {
- IRBuilder<> &Builder = OMPBuilder.Builder;
+Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
+ ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
+ AttributeList FuncAttrs) {
OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
- Type *VoidTy = Type::getVoidTy(Ctx);
- Type *Int32Ty = Builder.getInt32Ty();
- Type *PtrTy = PointerType::getUnqual(Ctx);
- auto FuncTy =
- FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
- Function *LtGRFunc =
+ FunctionType *FuncTy = FunctionType::get(
+ Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+ /* IsVarArg */ false);
+ Function *LtGCFunc =
Function::Create(FuncTy, GlobalVariable::InternalLinkage,
- "_omp_reduction_list_to_global_reduce_func", &M);
- LtGRFunc->setDoesNotRecurse();
+ "_omp_reduction_global_to_list_copy_func", &M);
+ LtGCFunc->setAttributes(FuncAttrs);
+ LtGCFunc->addParamAttr(0, Attribute::NoUndef);
+ LtGCFunc->addParamAttr(1, Attribute::NoUndef);
+ LtGCFunc->addParamAttr(2, Attribute::NoUndef);
- BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGRFunc);
+ BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
Builder.SetInsertPoint(EntryBlock);
- // Set arg names
- Argument *Arg0 = LtGRFunc->getArg(0);
- Argument *Arg1 = LtGRFunc->getArg(1);
- Argument *Arg2 = LtGRFunc->getArg(2);
- Arg0->setName("buffer_arg");
- Arg1->setName("idx_arg");
- Arg2->setName("reduce_list_arg");
-
- Value *BufferArgAlloca =
- Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
- Value *IdxArgAlloca =
- Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
- Value *ReduceListArgAlloca =
- Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
- // FIXME(Jan): Compute array type
- auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
- Value *LocalReduceList =
- Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
-
+ // Buffer: global reduction buffer.
+ Argument *BufferArg = LtGCFunc->getArg(0);
+ // Idx: index of the buffer.
+ Argument *IdxArg = LtGCFunc->getArg(1);
+ // ReduceList: thread local Reduce list.
+ Argument *ReduceListArg = LtGCFunc->getArg(2);
+
+ Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+ BufferArg->getName() + ".addr");
+ Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+ IdxArg->getName() + ".addr");
+ Value *ReduceListArgAlloca = Builder.CreateAlloca(
+ Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+ BufferArgAlloca, Builder.getPtrTy(),
+ BufferArgAlloca->getName() + ".ascast");
Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+ IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
- Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- LocalReduceList, PtrTy, LocalReduceList->getName() + ".acast");
- // FIXME(JAN): Assume a single globalized variable for now, this should be
- // passed in
- Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
- Type *TypeArgs[] = {SingleReductionTy};
- StructType *ReductionsBufferTy =
- StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
-
- Builder.CreateStore(Arg0, BufferArgAddrCast);
- Builder.CreateStore(Arg1, IdxArgAddrCast);
- Builder.CreateStore(Arg2, ReduceListArgAddrCast);
-
- Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
- Value *Idxs[] = {Builder.CreateLoad(Int32Ty, IdxArgAddrCast, "idxs")};
- // FIXME(Jan): Assume TEK_SCALAR
+ ReduceListArgAlloca, Builder.getPtrTy(),
+ ReduceListArgAlloca->getName() + ".ascast");
+ Builder.CreateStore(BufferArg, BufferArgAddrCast);
+ Builder.CreateStore(IdxArg, IdxArgAddrCast);
+ Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+ Value *LocalReduceList =
+ Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+ Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+ Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
for (auto En : enumerate(ReductionInfos)) {
const OpenMPIRBuilder::ReductionInfo &RI = En.value();
- Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedListArrayTy, LocalReduceListAddrCast, 0, En.index());
+ auto *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+ // Reduce element = LocalReduceList[i]
+ Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
+ RedListArrayTy, LocalReduceList,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ // elemptr = ((CopyType*)(elemptrptr)) + I
+ Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+ // Global = Buffer.VD[Idx];
Value *BufferVD =
- Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+ Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
- ReductionsBufferTy, BufferVD, 0, En.index());
- Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
+ ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+
+ switch (RI.EvaluationKind) {
+ case EvaluationKindTy::Scalar: {
+ Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
+ Builder.CreateStore(TargetElement, ElemPtr);
+ break;
+ }
+ case EvaluationKindTy::Complex: {
+ // FIXME(Jan): Complex type
+ break;
+ }
+ case EvaluationKindTy::Aggregate:
+ Value *SizeVal =
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
+ Builder.CreateMemCpy(
+ ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+ GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+ SizeVal, false);
+ break;
+ }
}
- Value *ReduceList = Builder.CreateLoad(PtrTy, ReduceListArgAddrCast);
- Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList});
Builder.CreateRetVoid();
Builder.restoreIP(OldIP);
- return LtGRFunc;
+ return LtGCFunc;
}
-/// This function emits a helper that reduces all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
-/// void *GlobPtrs[];
-/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
-/// ...
-/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
-/// reduce_function(GlobPtrs, reduce_data);
-/// Create a function with a unique name and a "void (i8*, i8*)" signature in
-/// the given module and return it.
-static Function *emitGlobalToListReduceFunction(
- Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
- ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
- OpenMPIRBuilder &OMPBuilder) {
- IRBuilder<> &Builder = OMPBuilder.Builder;
+Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
+ ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+ Type *ReductionsBufferTy, AttributeList FuncAttrs) {
OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
- Type *VoidTy = Type::getVoidTy(Ctx);
- Type *Int32Ty = Builder.getInt32Ty();
- Type *PtrTy = PointerType::getUnqual(Ctx);
- auto FuncTy =
- FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+ auto *FuncTy = FunctionType::get(
+ Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+ /* IsVarArg */ false);
Function *LtGRFunc =
Function::Create(FuncTy, GlobalVariable::InternalLinkage,
"_omp_reduction_global_to_list_reduce_func", &M);
- LtGRFunc->setDoesNotRecurse();
+ LtGRFunc->setAttributes(FuncAttrs);
+ LtGRFunc->addParamAttr(0, Attribute::NoUndef);
+ LtGRFunc->addParamAttr(1, Attribute::NoUndef);
+ LtGRFunc->addParamAttr(2, Attribute::NoUndef);
- BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGRFunc);
+ BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
Builder.SetInsertPoint(EntryBlock);
- // Set arg names
- Argument *Arg0 = LtGRFunc->getArg(0);
- Argument *Arg1 = LtGRFunc->getArg(1);
- Argument *Arg2 = LtGRFunc->getArg(2);
- Arg0->setName("buffer_arg");
- Arg1->setName("idx_arg");
- Arg2->setName("reduce_list_arg");
-
- Value *BufferArgAlloca =
- Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
- Value *IdxArgAlloca =
- Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
- Value *ReduceListArgAlloca =
- Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
- // FIXME(Jan): Compute array type
- auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+ // Buffer: global reduction buffer.
+ Argument *BufferArg = LtGRFunc->getArg(0);
+ // Idx: index of the buffer.
+ Argument *IdxArg = LtGRFunc->getArg(1);
+ // ReduceList: thread local Reduce list.
+ Argument *ReduceListArg = LtGRFunc->getArg(2);
+
+ Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+ BufferArg->getName() + ".addr");
+ Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+ IdxArg->getName() + ".addr");
+ Value *ReduceListArgAlloca = Builder.CreateAlloca(
+ Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+ ArrayType *RedListArrayTy =
+ ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+ // 1. Build a list of reduction variables.
+ // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
Value *LocalReduceList =
Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+ BufferArgAlloca, Builder.getPtrTy(),
+ BufferArgAlloca->getName() + ".ascast");
Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+ IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
- Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- LocalReduceList, PtrTy, LocalReduceList->getName() + ".acast");
- // FIXME(JAN): Assume a single globalized variable for now, this should be
- // passed in
- Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
- Type *TypeArgs[] = {SingleReductionTy};
- StructType *ReductionsBufferTy =
- StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
-
- Builder.CreateStore(Arg0, BufferArgAddrCast);
- Builder.CreateStore(Arg1, IdxArgAddrCast);
- Builder.CreateStore(Arg2, ReduceListArgAddrCast);
-
- Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
- Value *Idxs[] = {Builder.CreateLoad(Int32Ty, IdxArgAddrCast, "idxs")};
- // FIXME(Jan): Assume TEK_SCALAR
+ ReduceListArgAlloca, Builder.getPtrTy(),
+ ReduceListArgAlloca->getName() + ".ascast");
+ Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LocalReduceList, Builder.getPtrTy(),
+ LocalReduceList->getName() + ".ascast");
+
+ Builder.CreateStore(BufferArg, BufferArgAddrCast);
+ Builder.CreateStore(IdxArg, IdxArgAddrCast);
+ Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+ Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+ Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
for (auto En : enumerate(ReductionInfos)) {
- const OpenMPIRBuilder::ReductionInfo &RI = En.value();
- Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedListArrayTy, LocalReduceListAddrCast, 0, En.index());
+ Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
+ RedListArrayTy, ReductionList,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ // Global = Buffer.VD[Idx];
Value *BufferVD =
- Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+ Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
- ReductionsBufferTy, BufferVD, 0, En.index());
+ ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
}
- Value *ReduceList = Builder.CreateLoad(PtrTy, ReduceListArgAddrCast);
- Builder.CreateCall(ReduceFn, {ReduceList, LocalReduceListAddrCast});
+ // Call reduce_function(ReduceList, GlobalReduceList)
+ Value *ReduceList =
+ Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+ Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
+ ->addFnAttr(Attribute::NoUnwind);
Builder.CreateRetVoid();
Builder.restoreIP(OldIP);
return LtGRFunc;
}
-static Function *getFreshReductionFunc(Module &M) {
- Type *VoidTy = Type::getVoidTy(M.getContext());
- Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
- auto *FuncTy =
- FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
- return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
- ".omp.reduction.func", &M);
-}
+std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
+ std::string Suffix =
+ createPlatformSpecificName({"omp", "reduction", "reduction_func"});
+ return (Name + Suffix).str();
+}
+
+Function *OpenMPIRBuilder::createReductionFunction(
+ StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, bool IsGPU,
+ ReductionGenCBTy ReductionGenCBTy, AttributeList FuncAttrs) {
+ auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
+ {Builder.getPtrTy(), Builder.getPtrTy()},
+ /* IsVarArg */ false);
+ std::string Name = getReductionFuncName(ReducerName);
+ auto *ReductionFunc =
+ Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
+ ReductionFunc->setAttributes(FuncAttrs);
+ ReductionFunc->addParamAttr(0, Attribute::NoUndef);
+ ReductionFunc->addParamAttr(1, Attribute::NoUndef);
+ BasicBlock *EntryBB =
+ BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
+ Builder.SetInsertPoint(EntryBB);
-static void populateReductionFunction(
- Function *ReductionFunc,
- ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
- IRBuilder<> &Builder, bool IsGPU) {
- Module *Module = ReductionFunc->getParent();
- BasicBlock *ReductionFuncBlock =
- BasicBlock::Create(Module->getContext(), "", ReductionFunc);
- Builder.SetInsertPoint(ReductionFuncBlock);
Value *LHSArrayPtr = nullptr;
Value *RHSArrayPtr = nullptr;
if (IsGPU) {
@@ -3076,10 +3182,10 @@ static void populateReductionFunction(
Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
Value *RHSAlloca =
Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
- Value *LHSAddrCast =
- Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
- Value *RHSAddrCast =
- Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
+ Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
+ Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
Builder.CreateStore(Arg0, LHSAddrCast);
Builder.CreateStore(Arg1, RHSAddrCast);
LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
@@ -3089,30 +3195,65 @@ static void populateReductionFunction(
RHSArrayPtr = ReductionFunc->getArg(1);
}
- unsigned NumReductions = ReductionInfos.size();
- Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
-
+ Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+ SmallVector<Value *> LHSPtrs, RHSPtrs;
for (auto En : enumerate(ReductionInfos)) {
- const OpenMPIRBuilder::ReductionInfo &RI = En.value();
- Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedArrayTy, LHSArrayPtr, 0, En.index());
- Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
- Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
- LHSI8Ptr, RI.Variable->getType());
- Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
- Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedArrayTy, RHSArrayPtr, 0, En.index());
+ const ReductionInfo &RI = En.value();
+ Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
+ RedArrayTy, RHSArrayPtr,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
- Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
- RHSI8Ptr, RI.PrivateVariable->getType());
- Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
- Value *Reduced;
- Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
- if (!Builder.GetInsertBlock())
- return;
- Builder.CreateStore(Reduced, LHSPtr);
+ Value *RHS = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RHSI8Ptr, RI.PrivateVariable->getType(),
+ RHSI8Ptr->getName() + ".ascast");
+
+ Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
+ RedArrayTy, LHSArrayPtr,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+ Value *LHS = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
+
+ if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
+ LHSPtrs.emplace_back(LHS);
+ RHSPtrs.emplace_back(RHS);
+ } else {
+ LHS = Builder.CreateLoad(RI.ElementType, LHS);
+ RHS = Builder.CreateLoad(RI.ElementType, RHS);
+ Value *Reduced;
+ RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
+ if (!Builder.GetInsertBlock())
+ return ReductionFunc;
+ Builder.CreateStore(Reduced, LHS);
+ }
}
+
+ if (ReductionGenCBTy == ReductionGenCBTy::Clang)
+ for (auto En : enumerate(ReductionInfos)) {
+ unsigned Index = En.index();
+ const ReductionInfo &RI = En.value();
+ Value *LHSFixupPtr, *RHSFixupPtr;
+ Builder.restoreIP(RI.ReductionGenClang(
+ Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
+
+      // Fix the CallBack code generated to use the correct Values for the LHS
+ // and RHS
+ LHSFixupPtr->replaceUsesWithIf(
+ LHSPtrs[Index], [ReductionFunc](const Use &U) {
+ return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+ ReductionFunc;
+ });
+ RHSFixupPtr->replaceUsesWithIf(
+ RHSPtrs[Index], [ReductionFunc](const Use &U) {
+ return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+ ReductionFunc;
+ });
+ }
+
Builder.CreateRetVoid();
+ return ReductionFunc;
}
static void
@@ -3122,7 +3263,8 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
(void)RI;
assert(RI.Variable && "expected non-null variable");
assert(RI.PrivateVariable && "expected non-null private variable");
- assert(RI.ReductionGen && "expected non-null reduction generator callback");
+ assert((RI.ReductionGen || RI.ReductionGenClang) &&
+ "expected non-null reduction generator callback");
// JAN: Skip this assertion for GPU, address spaces are present
if (!IsGPU) {
assert(
@@ -3137,73 +3279,103 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
const LocationDescription &Loc, InsertPointTy AllocaIP,
- ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait,
- bool IsTeamsReduction, bool HasDistribute) {
- checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
- LLVMContext &Ctx = M.getContext();
+ InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
+ bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
+ ReductionGenCBTy ReductionGenCBTy, std::optional<omp::GV> GridValue,
+ unsigned ReductionBufNum, Value *SrcLocInfo) {
if (!updateToLocation(Loc))
return InsertPointTy();
+ Builder.restoreIP(CodeGenIP);
+ checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
+ LLVMContext &Ctx = M.getContext();
+
+ // Source location for the ident struct
+ if (!SrcLocInfo) {
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+ }
if (ReductionInfos.size() == 0)
return Builder.saveIP();
- assert(ReductionInfos.size() == 1 && "More than one reduction variable");
+ Function *CurFunc = Builder.GetInsertBlock()->getParent();
+ AttributeList FuncAttrs;
+ AttrBuilder AttrBldr(Ctx);
+ for (auto Attr : CurFunc->getAttributes().getFnAttrs())
+ AttrBldr.addAttribute(Attr);
+ AttrBldr.removeAttribute(Attribute::OptimizeNone);
+ FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
- // Copied code from createReductions
- BasicBlock *InsertBlock = Loc.IP.getBlock();
- BasicBlock *ContinuationBlock =
- InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
- InsertBlock->getTerminator()->eraseFromParent();
- Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
+ // Set the grid value in the config needed for lowering later on
+ if (GridValue.has_value())
+ Config.setGridValue(GridValue.value());
+ else
+ Config.setGridValue(getGridValue(T, Config.TargetFeatures));
Function *ReductionFunc = nullptr;
if (GLOBAL_ReductionFunc) {
ReductionFunc = GLOBAL_ReductionFunc;
} else {
- ReductionFunc = getFreshReductionFunc(M);
- GLOBAL_ReductionFunc = ReductionFunc;
- InsertPointTy CurIP = Builder.saveIP();
- populateReductionFunction(ReductionFunc, ReductionInfos, Builder, true);
- Builder.restoreIP(CurIP);
+ CodeGenIP = Builder.saveIP();
+ ReductionFunc = createReductionFunction(
+ Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, true,
+ ReductionGenCBTy, FuncAttrs);
+ Builder.restoreIP(CodeGenIP);
}
uint32_t SrcLocStrSize;
Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
Value *RTLoc =
- getOrCreateIdent(SrcLocStr, SrcLocStrSize, llvm::omp::IdentFlag(0), 0);
+ getOrCreateIdent(SrcLocStr, SrcLocStrSize, omp::IdentFlag(0), 0);
- // 1. Build a list of reduction variables
+ // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
+ // RedList, shuffle_reduce_func, interwarp_copy_func);
+ // or
+ // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
+ Value *Res;
+
+ // 1. Build a list of reduction variables.
+ // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
auto Size = ReductionInfos.size();
- // FIXME(JAN): skipping variably modified type storage for array size
Type *PtrTy = PointerType::getUnqual(Ctx);
Type *RedArrayTy = ArrayType::get(PtrTy, Size);
- InsertPointTy CurIP = Builder.saveIP();
+ CodeGenIP = Builder.saveIP();
Builder.restoreIP(AllocaIP);
Value *ReductionListAlloca =
Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
- Value *ReductionList =
- Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionListAlloca, PtrTy);
- Builder.restoreIP(CurIP);
+ Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
+ Builder.restoreIP(CodeGenIP);
+ Type *IndexTy = Builder.getIndexTy(
+ M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
for (auto En : enumerate(ReductionInfos)) {
const ReductionInfo &RI = En.value();
- Value *ElemPtr = Builder.CreateConstGEP2_64(RedArrayTy, ReductionList, 0,
- En.index(), "elem_ptr");
+ Value *ElemPtr = Builder.CreateInBoundsGEP(
+ RedArrayTy, ReductionList,
+ {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
Value *CastElem =
Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
Builder.CreateStore(CastElem, ElemPtr);
}
- CurIP = Builder.saveIP();
- Function *SarFunc = emitShuffleAndReduceFunction(M, Loc, ReductionInfos,
- ReductionFunc, *this);
- Function *WcFunc = emitInterWarpCopyFunction(M, Loc, ReductionInfos, *this);
- Builder.restoreIP(CurIP);
+ CodeGenIP = Builder.saveIP();
+ Function *SarFunc =
+ emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
+ Function *WcFunc = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
+ Builder.restoreIP(CodeGenIP);
- Value *RL =
- Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
- Value *ReductionDataSize =
- getTypeSizeInBytesValue(Builder, M, ReductionInfos.begin()->ElementType);
+ Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
- Value *Res;
+ unsigned MaxDataSize = 0;
+ SmallVector<Type *> ReductionTypeArgs;
+ for (auto En : enumerate(ReductionInfos)) {
+ auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
+ if (Size > MaxDataSize)
+ MaxDataSize = Size;
+ ReductionTypeArgs.emplace_back(En.value().ElementType);
+ }
+ Value *ReductionDataSize =
+ Builder.getInt64(MaxDataSize * ReductionInfos.size());
if (!IsTeamsReduction) {
Value *SarFuncCast =
Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
@@ -3214,25 +3386,27 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
Res = Builder.CreateCall(Pv2Ptr, Args);
} else {
- CurIP = Builder.saveIP();
- Function *LtGCFunc =
- emitListToGlobalCopyFunction(M, Loc, ReductionInfos, *this);
- Function *LtGRFunc = emitListToGlobalReduceFunction(M, Loc, ReductionInfos,
- ReductionFunc, *this);
- Function *GtLCFunc =
- emitGlobalToListCopyFunction(M, Loc, ReductionInfos, *this);
- Function *GtLRFunc = emitGlobalToListReduceFunction(M, Loc, ReductionInfos,
- ReductionFunc, *this);
- Builder.restoreIP(CurIP);
-
+ CodeGenIP = Builder.saveIP();
+ StructType *ReductionsBufferTy = StructType::create(
+ Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
+ Function *LtGCFunc = emitListToGlobalCopyFunction(
+ ReductionInfos, ReductionsBufferTy, FuncAttrs);
+ Function *LtGRFunc = emitListToGlobalReduceFunction(
+ ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
+ Function *GtLCFunc = emitGlobalToListCopyFunction(
+ ReductionInfos, ReductionsBufferTy, FuncAttrs);
+ Function *GtLRFunc = emitGlobalToListReduceFunction(
+ ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
+ Builder.restoreIP(CodeGenIP);
- Value *KernelTeamsReductionPtr = Builder.CreateCall(RedFixedBuferFn, {});
+ Value *KernelTeamsReductionPtr = Builder.CreateCall(
+ RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
Value *Args3[] = {RTLoc,
KernelTeamsReductionPtr,
- Builder.getInt32(1024),
+ Builder.getInt32(ReductionBufNum),
ReductionDataSize,
RL,
SarFunc,
@@ -3247,32 +3421,50 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
Res = Builder.CreateCall(TeamsReduceFn, Args3);
}
- if (IsTeamsReduction || !HasDistribute) {
- Function *CurFunc = Builder.GetInsertBlock()->getParent();
- BasicBlock *ExitBB =
- BasicBlock::Create(Ctx, ".omp.reduction.done", CurFunc);
- BasicBlock *ThenBB =
- BasicBlock::Create(Ctx, ".omp.reduction.then", CurFunc);
- Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
- Builder.CreateCondBr(Cond, ThenBB, ExitBB);
+ // 5. Build if (res == 1)
+ BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
+ BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
+ Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
+ Builder.CreateCondBr(Cond, ThenBB, ExitBB);
- Builder.SetInsertPoint(ThenBB);
- for (auto En : enumerate(ReductionInfos)) {
- const ReductionInfo &RI = En.value();
- Value *InputVal = Builder.CreateLoad(RI.ElementType, RI.Variable);
- Value *RedVal = Builder.CreateLoad(
- RI.ElementType, Builder.CreatePointerBitCastOrAddrSpaceCast(
- RI.PrivateVariable, PtrTy));
- Value *sum;
- Builder.restoreIP(
- RI.ReductionGen(Builder.saveIP(), InputVal, RedVal, sum));
- Builder.CreateStore(sum, RI.Variable);
- Builder.CreateBr(ExitBB);
+ // 6. Build then branch: where we have reduced values in the master
+ // thread in each team.
+ // __kmpc_end_reduce{_nowait}(<gtid>);
+ // break;
+ emitBlock(ThenBB, CurFunc);
+
+ // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
+ for (auto En : enumerate(ReductionInfos)) {
+ const ReductionInfo &RI = En.value();
+ Value *LHS = RI.Variable;
+ Value *RHS =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
+
+ if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
+ Value *LHSPtr, *RHSPtr;
+ Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
+ &LHSPtr, &RHSPtr, CurFunc));
+
+      // Fix the CallBack code generated to use the correct Values for the LHS
+ // and RHS
+ LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
+ return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+ ReductionFunc;
+ });
+ RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
+ return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+ ReductionFunc;
+ });
+ } else {
+ // LHS = Builder.CreateLoad(LHS);
+ // LHS = Builder.CreateLoad(LHS);
+ // Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS));
}
- Builder.SetInsertPoint(ExitBB);
}
- Builder.CreateBr(ContinuationBlock);
- Builder.SetInsertPoint(ContinuationBlock);
+ emitBlock(ExitBB, CurFunc);
+
+ Config.setEmitLLVMUsed();
+
return Builder.saveIP();
}
@@ -3281,8 +3473,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef,
bool IsTeamsReduction, bool HasDistribute) {
if (Config.isGPU())
- return createReductionsGPU(Loc, AllocaIP, ReductionInfos, IsNoWait,
- IsTeamsReduction, HasDistribute);
+ return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
+ IsNoWait, IsTeamsReduction, HasDistribute);
checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
@@ -3320,10 +3512,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
Module *Module = Func->getParent();
uint32_t SrcLocStrSize;
Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
- bool CanGenerateAtomic =
- llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
- return RI.AtomicReductionGen;
- });
+ bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
+ return RI.AtomicReductionGen;
+ });
Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
CanGenerateAtomic
? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
@@ -3333,7 +3524,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
const DataLayout &DL = Module->getDataLayout();
unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
- Function *ReductionFunc = getFreshReductionFunc(*Module);
+ Function *ReductionFunc = createReductionFunction(
+ Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos);
Value *Lock = getOMPCriticalRegionLock(".reduction");
Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
@@ -3412,36 +3604,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
// Populate the outlined reduction function using the elementwise reduction
// function. Partial values are extracted from the type-erased array of
// pointers to private variables.
- BasicBlock *ReductionFuncBlock =
- BasicBlock::Create(Module->getContext(), "", ReductionFunc);
- Builder.SetInsertPoint(ReductionFuncBlock);
- Value *LHSArrayPtr = ReductionFunc->getArg(0);
- Value *RHSArrayPtr = ReductionFunc->getArg(1);
-
- for (auto En : enumerate(ReductionInfos)) {
- const ReductionInfo &RI = En.value();
- Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedArrayTy, LHSArrayPtr, 0, En.index());
- Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
- Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
- Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
- Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedArrayTy, RHSArrayPtr, 0, En.index());
- Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
- Value *RHSPtr =
- Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
- Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
- Value *Reduced;
- Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
- if (!Builder.GetInsertBlock())
- return InsertPointTy();
- // store is inside of the reduction region when using by-ref
- if (!IsByRef)
- Builder.CreateStore(Reduced, LHSPtr);
- }
- Builder.CreateRetVoid();
-
- populateReductionFunction(ReductionFunc, ReductionInfos, Builder, false);
+ // populateReductionFunction(ReductionFunc, ReductionInfos, Builder, false);
Builder.SetInsertPoint(ContinuationBlock);
return Builder.saveIP();
}
@@ -5799,7 +5962,8 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
Ident,
DynamicEnvironment,
});
- Twine KernelEnvironmentName = KernelName + "_kernel_environment";
+ std::string KernelEnvironmentName =
+ (KernelName + "_kernel_environment").str();
GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
KernelEnvironmentInitializer, KernelEnvironmentName,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 646d0ed73084ad..84b6c42b988517 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -847,14 +847,17 @@ static void collectReductionInfo(
// Collect the reduction information.
reductionInfos.reserve(numReductions);
for (unsigned i = 0; i < numReductions; ++i) {
- llvm::OpenMPIRBuilder::AtomicReductionGenTy atomicGen = nullptr;
+ llvm::OpenMPIRBuilder::AtomicReductionGenCB atomicGen = nullptr;
if (owningAtomicReductionGens[i])
atomicGen = owningAtomicReductionGens[i];
llvm::Value *variable =
moduleTranslation.lookupValue(loop.getReductionVars()[i]);
- reductionInfos.push_back(
- {moduleTranslation.convertType(reductionDecls[i].getType()), variable,
- privateReductionVariables[i], owningReductionGens[i], atomicGen});
+ reductionInfos.push_back(llvm::OpenMPIRBuilder::ReductionInfo(
+ moduleTranslation.convertType(reductionDecls[i].getType()), variable,
+ privateReductionVariables[i],
+ /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar,
+ owningReductionGens[i],
+ /*ReductionGenClang=*/nullptr, atomicGen));
}
}
>From 4ddbd7bdec882a689924fb5d3a690026121522e2 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 7 Feb 2024 14:23:41 +0000
Subject: [PATCH 03/18] Remove ReductionInfoManager. Add assert failures for
complex data types.
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 47 -------------------
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 12 +++--
2 files changed, 8 insertions(+), 51 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 4f2207cc700fbf..0de05ffd6dac36 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1350,50 +1350,6 @@ class OpenMPIRBuilder {
Value *ScratchpadWidth = nullptr;
};
- /// A class that manages the reduction info to facilitate lowering of
- /// reductions at multiple levels of parallelism. For example handling teams
- /// and parallel reductions on GPUs
- class ReductionInfoManager {
- private:
- SmallVector<ReductionInfo> ReductionInfos;
- std::optional<InsertPointTy> PrivateVarAllocaIP;
-
- public:
- ReductionInfoManager(){};
- void clear() {
- ReductionInfos.clear();
- PrivateVarAllocaIP.reset();
- }
-
- Value *allocatePrivateReductionVar(IRBuilderBase &builder,
- InsertPointTy &allocaIP, Type *VarType) {
- Type *ptrTy = PointerType::getUnqual(builder.getContext());
- Value *var = builder.CreateAlloca(VarType);
- var->setName("private_redvar");
- Value *castVar = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
- ReductionInfos.push_back(ReductionInfo(castVar));
- return castVar;
- }
-
- ReductionInfo getReductionInfo(unsigned Index) {
- return ReductionInfos[Index];
- }
- ReductionInfo setReductionInfo(unsigned Index, ReductionInfo &RI) {
- return ReductionInfos[Index] = RI;
- }
- Value *getPrivateReductionVariable(unsigned Index) {
- return ReductionInfos[Index].PrivateVariable;
- }
- SmallVector<ReductionInfo> &getReductionInfos() { return ReductionInfos; }
-
- bool hasPrivateVarAllocaIP() { return PrivateVarAllocaIP.has_value(); }
- InsertPointTy getPrivateVarAllocaIP() {
- assert(PrivateVarAllocaIP.has_value() && "AllocaIP not set");
- return *PrivateVarAllocaIP;
- }
- void setPrivateVarAllocaIP(InsertPointTy IP) { PrivateVarAllocaIP = IP; }
- };
-
/// Supporting functions for Reductions CodeGen.
private:
/// Emit the llvm.used metadata.
@@ -2090,9 +2046,6 @@ class OpenMPIRBuilder {
/// Info manager to keep track of target regions.
OffloadEntriesInfoManager OffloadInfoManager;
- /// Info manager to keep track of reduction information;
- ReductionInfoManager RIManager;
-
/// The target triple of the underlying module.
const Triple T;
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 435af68e344a8a..5b79baba1b3e1a 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2364,6 +2364,7 @@ void OpenMPIRBuilder::emitReductionListCopy(
break;
}
case EvaluationKindTy::Complex: {
+ assert(false && "Complex data type not handled");
break;
}
case EvaluationKindTy::Aggregate: {
@@ -2869,9 +2870,10 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
break;
}
case EvaluationKindTy::Complex: {
+ assert(false && "Complex data type not handled");
break;
}
- case EvaluationKindTy::Aggregate:
+ case EvaluationKindTy::Aggregate: {
Value *SizeVal =
Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
Builder.CreateMemCpy(
@@ -2879,6 +2881,7 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
break;
}
+ }
}
Builder.CreateRetVoid();
@@ -3043,10 +3046,10 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
break;
}
case EvaluationKindTy::Complex: {
- // FIXME(Jan): Complex type
+ assert(false && "Complex data type not handled");
break;
}
- case EvaluationKindTy::Aggregate:
+ case EvaluationKindTy::Aggregate: {
Value *SizeVal =
Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
Builder.CreateMemCpy(
@@ -3055,6 +3058,7 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
SizeVal, false);
break;
}
+ }
}
Builder.CreateRetVoid();
@@ -3265,7 +3269,6 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
assert(RI.PrivateVariable && "expected non-null private variable");
assert((RI.ReductionGen || RI.ReductionGenClang) &&
"expected non-null reduction generator callback");
- // JAN: Skip this assertion for GPU, address spaces are present
if (!IsGPU) {
assert(
RI.Variable->getType() == RI.PrivateVariable->getType() &&
@@ -4124,6 +4127,7 @@ static void createTargetLoopWorkshareCall(
// FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
// not be the right way to fix it, but this works for now.
if (OMPBuilder->Config.isGPU()) {
+ assert(false && "Akash");
if (LoopType != WorksharingLoopType::DistributeStaticLoop)
Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
LLVMContext &Ctx = M.getContext();
>From e9f3bc242f6e3964f6cfc918be06a955f3a0525f Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 7 Feb 2024 14:27:15 +0000
Subject: [PATCH 04/18] Remove unintended debug assert added in previous
commit.
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 5b79baba1b3e1a..c48b7a3392bd06 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4127,7 +4127,6 @@ static void createTargetLoopWorkshareCall(
// FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
// not be the right way to fix it, but this works for now.
if (OMPBuilder->Config.isGPU()) {
- assert(false && "Akash");
if (LoopType != WorksharingLoopType::DistributeStaticLoop)
Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
LLVMContext &Ctx = M.getContext();
>From 3ed762d842215f16aafd6c1cd37e9a5ae2675e55 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Tue, 13 Feb 2024 12:48:38 +0000
Subject: [PATCH 05/18] Remove unintended code from MLIRTranslation.
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 52 +++++++++----------
1 file changed, 26 insertions(+), 26 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 84b6c42b988517..7657f28c9539a0 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -852,12 +852,12 @@ static void collectReductionInfo(
atomicGen = owningAtomicReductionGens[i];
llvm::Value *variable =
moduleTranslation.lookupValue(loop.getReductionVars()[i]);
- reductionInfos.push_back(llvm::OpenMPIRBuilder::ReductionInfo(
- moduleTranslation.convertType(reductionDecls[i].getType()), variable,
- privateReductionVariables[i],
- /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar,
- owningReductionGens[i],
- /*ReductionGenClang=*/nullptr, atomicGen));
+ reductionInfos.push_back(
+ {moduleTranslation.convertType(reductionDecls[i].getType()), variable,
+ privateReductionVariables[i],
+ /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar,
+ owningReductionGens[i],
+ /*ReductionGenClang=*/nullptr, atomicGen});
}
}
@@ -3131,26 +3131,26 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation(
moduleTranslation);
return failure();
})
- .Case("omp.requires",
- [&](Attribute attr) {
- if (auto requiresAttr =
- attr.dyn_cast<omp::ClauseRequiresAttr>()) {
- using Requires = omp::ClauseRequires;
- Requires flags = requiresAttr.getValue();
- llvm::OpenMPIRBuilderConfig &config =
- moduleTranslation.getOpenMPBuilder()->Config;
- config.setHasRequiresReverseOffload(
- bitEnumContainsAll(flags, Requires::reverse_offload));
- config.setHasRequiresUnifiedAddress(
- bitEnumContainsAll(flags, Requires::unified_address));
- config.setHasRequiresUnifiedSharedMemory(
- bitEnumContainsAll(flags, Requires::unified_shared_memory));
- config.setHasRequiresDynamicAllocators(
- bitEnumContainsAll(flags, Requires::dynamic_allocators));
- return success();
- }
- return failure();
- })
+ .Case(
+ "omp.requires",
+ [&](Attribute attr) {
+ if (auto requiresAttr = attr.dyn_cast<omp::ClauseRequiresAttr>()) {
+ using Requires = omp::ClauseRequires;
+ Requires flags = requiresAttr.getValue();
+ llvm::OpenMPIRBuilderConfig &config =
+ moduleTranslation.getOpenMPBuilder()->Config;
+ config.setHasRequiresReverseOffload(
+ bitEnumContainsAll(flags, Requires::reverse_offload));
+ config.setHasRequiresUnifiedAddress(
+ bitEnumContainsAll(flags, Requires::unified_address));
+ config.setHasRequiresUnifiedSharedMemory(
+ bitEnumContainsAll(flags, Requires::unified_shared_memory));
+ config.setHasRequiresDynamicAllocators(
+ bitEnumContainsAll(flags, Requires::dynamic_allocators));
+ return convertRequiresAttr(*op, requiresAttr, moduleTranslation);
+ }
+ return failure();
+ })
.Default([](Attribute) {
// Fall through for omp attributes that do not require lowering.
return success();
>From b16da894c2c8b7b8298c8a5da1cec516849359bd Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Tue, 13 Feb 2024 13:48:43 +0000
Subject: [PATCH 06/18] Remove unintended code for WSLoop
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 18 ++----------------
1 file changed, 2 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index c48b7a3392bd06..712e5f7e00ff98 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4120,24 +4120,10 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
static void createTargetLoopWorkshareCall(
OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
- Type *ParallelTaskPtr, Value *TripCountOrig, Function &LoopBodyFn) {
+ Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
+ Type *TripCountTy = TripCount->getType();
Module &M = OMPBuilder->M;
IRBuilder<> &Builder = OMPBuilder->Builder;
- Value *TripCount = TripCountOrig;
- // FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
- // not be the right way to fix it, but this works for now.
- if (OMPBuilder->Config.isGPU()) {
- if (LoopType != WorksharingLoopType::DistributeStaticLoop)
- Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
- LLVMContext &Ctx = M.getContext();
- Type *IVTy = TripCountOrig->getType();
- Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32
- ? Type::getInt32Ty(Ctx)
- : Type::getInt64Ty(Ctx);
- Constant *One = ConstantInt::get(InternalIVTy, 1);
- TripCount = Builder.CreateSub(TripCountOrig, One, "modified_trip_count");
- }
- Type *TripCountTy = TripCount->getType();
FunctionCallee RTLFn =
getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
SmallVector<Value *, 8> RealArgs;
>From 1f3be07ac6d75068c6e602dac9bc28700c73870f Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Tue, 13 Feb 2024 15:02:45 +0000
Subject: [PATCH 07/18] Remove more non-essential code to this patch.
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 5 ---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 34 +++++--------------
2 files changed, 9 insertions(+), 30 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 0de05ffd6dac36..4065ddfbe230ff 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -106,11 +106,6 @@ class OpenMPIRBuilderConfig {
/// Flag for specifying if offloading is mandatory.
std::optional<bool> OpenMPOffloadMandatory;
- /// Name of the target processor.
- StringRef TargetCPU;
- /// String representation of the target processor's features.
- StringRef TargetFeatures;
-
/// First separator used between the initial two parts of a name.
std::optional<StringRef> FirstSeparator;
/// Separator used between all of the rest consecutive parts of s name
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 712e5f7e00ff98..6c0e1070e705ef 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -147,22 +147,13 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
Function *GLOBAL_ReductionFunc = nullptr;
-static const omp::GV &getGridValue(const Triple &T, StringRef Features) {
- if (T.isAMDGPU()) {
- if (Features.count("+wavefrontsize64"))
- return omp::getAMDGPUGridValues<64>();
- return omp::getAMDGPUGridValues<32>();
- }
- if (T.isNVPTX())
- return omp::NVPTXGridValues;
- llvm_unreachable("No grid value available for this architecture!");
-}
-
static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
if (T.isAMDGPU()) {
StringRef Features =
Kernel->getFnAttribute("target-features").getValueAsString();
- return getGridValue(T, Features);
+ if (Features.count("+wavefrontsize64"))
+ return omp::getAMDGPUGridValues<64>();
+ return omp::getAMDGPUGridValues<32>();
}
if (T.isNVPTX())
return omp::NVPTXGridValues;
@@ -3310,12 +3301,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
AttrBldr.removeAttribute(Attribute::OptimizeNone);
FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
- // Set the grid value in the config needed for lowering later on
- if (GridValue.has_value())
- Config.setGridValue(GridValue.value());
- else
- Config.setGridValue(getGridValue(T, Config.TargetFeatures));
-
Function *ReductionFunc = nullptr;
if (GLOBAL_ReductionFunc) {
ReductionFunc = GLOBAL_ReductionFunc;
@@ -3327,6 +3312,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
Builder.restoreIP(CodeGenIP);
}
+ // Set the grid value in the config needed for lowering later on
+ if (GridValue.has_value())
+ Config.setGridValue(GridValue.value());
+ else
+ Config.setGridValue(getGridValue(T, ReductionFunc));
+
uint32_t SrcLocStrSize;
Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
Value *RTLoc =
@@ -5884,9 +5875,6 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
Function *Kernel = Builder.GetInsertBlock()->getParent();
- // Set the grid value in the config needed for lowering later on
- Config.setGridValue(getGridValue(T, Config.TargetFeatures));
-
// Manifest the launch configuration in the metadata matching the kernel
// environment.
if (MinTeamsVal > 1 || MaxTeamsVal > 0)
@@ -6140,10 +6128,6 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
if (T.isAMDGCN())
OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
- if (!Config.TargetCPU.empty())
- OutlinedFn->addFnAttr("target-cpu", Config.TargetCPU);
- if (!Config.TargetFeatures.empty())
- OutlinedFn->addFnAttr("target-features", Config.TargetFeatures);
}
}
>From 483a63bfab3017a4e194d55128364dcd7316fcf2 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 14 Feb 2024 13:38:53 +0000
Subject: [PATCH 08/18] Update OMPIRBuilderTest.cpp.
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 18 +++++++-----------
.../unittests/Frontend/OpenMPIRBuilderTest.cpp | 16 ++++++++++++----
2 files changed, 19 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 6c0e1070e705ef..41a5e904bf4394 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3200,7 +3200,7 @@ Function *OpenMPIRBuilder::createReductionFunction(
RedArrayTy, RHSArrayPtr,
{ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
- Value *RHS = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
RHSI8Ptr, RI.PrivateVariable->getType(),
RHSI8Ptr->getName() + ".ascast");
@@ -3208,20 +3208,20 @@ Function *OpenMPIRBuilder::createReductionFunction(
RedArrayTy, LHSArrayPtr,
{ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
- Value *LHS = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
- LHSPtrs.emplace_back(LHS);
- RHSPtrs.emplace_back(RHS);
+ LHSPtrs.emplace_back(LHSPtr);
+ RHSPtrs.emplace_back(RHSPtr);
} else {
- LHS = Builder.CreateLoad(RI.ElementType, LHS);
- RHS = Builder.CreateLoad(RI.ElementType, RHS);
+ Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+ Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
Value *Reduced;
RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
if (!Builder.GetInsertBlock())
return ReductionFunc;
- Builder.CreateStore(Reduced, LHS);
+ Builder.CreateStore(Reduced, LHSPtr);
}
}
@@ -3595,10 +3595,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
Builder.CreateUnreachable();
}
- // Populate the outlined reduction function using the elementwise reduction
- // function. Partial values are extracted from the type-erased array of
- // pointers to private variables.
- // populateReductionFunction(ReductionFunc, ReductionInfos, Builder, false);
Builder.SetInsertPoint(ContinuationBlock);
return Builder.saveIP();
}
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 5c415cadcd686c..5cf1dbd0620f87 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -4980,8 +4980,12 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) {
Builder.restoreIP(AfterIP);
OpenMPIRBuilder::ReductionInfo ReductionInfos[] = {
- {SumType, SumReduced, SumPrivatized, sumReduction, sumAtomicReduction},
- {XorType, XorReduced, XorPrivatized, xorReduction, xorAtomicReduction}};
+ {SumType, SumReduced, SumPrivatized,
+ /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
+ sumReduction, /*ReductionGenClang=*/nullptr, sumAtomicReduction},
+ {XorType, XorReduced, XorPrivatized,
+ /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
+ xorReduction, /*ReductionGenClang=*/nullptr, xorAtomicReduction}};
OMPBuilder.createReductions(BodyIP, BodyAllocaIP, ReductionInfos);
@@ -5232,10 +5236,14 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) {
OMPBuilder.createReductions(
FirstBodyIP, FirstBodyAllocaIP,
- {{SumType, SumReduced, SumPrivatized, sumReduction, sumAtomicReduction}});
+ {{SumType, SumReduced, SumPrivatized,
+ /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
+ sumReduction, /*ReductionGenClang=*/nullptr, sumAtomicReduction}});
OMPBuilder.createReductions(
SecondBodyIP, SecondBodyAllocaIP,
- {{XorType, XorReduced, XorPrivatized, xorReduction, xorAtomicReduction}});
+ {{XorType, XorReduced, XorPrivatized,
+ /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
+ xorReduction, /*ReductionGenClang=*/nullptr, xorAtomicReduction}});
Builder.restoreIP(AfterIP);
Builder.CreateRetVoid();
>From 14b5286b43780572ea02489e92d46349d545e6dc Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 14 Feb 2024 15:48:55 +0000
Subject: [PATCH 09/18] Fix CPU reductions.
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 1 -
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 103 +++++++++++++-----
2 files changed, 74 insertions(+), 30 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 4065ddfbe230ff..6383bc1775a51e 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1574,7 +1574,6 @@ class OpenMPIRBuilder {
/// \return The reduction function.
Function *createReductionFunction(
StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
- bool IsGpu = false,
ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
AttributeList FuncAttrs = {});
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 41a5e904bf4394..f8a8e70e4c3f4a 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3147,13 +3147,13 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
}
Function *OpenMPIRBuilder::createReductionFunction(
- StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, bool IsGPU,
+ StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
ReductionGenCBTy ReductionGenCBTy, AttributeList FuncAttrs) {
auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
{Builder.getPtrTy(), Builder.getPtrTy()},
/* IsVarArg */ false);
std::string Name = getReductionFuncName(ReducerName);
- auto *ReductionFunc =
+ Function *ReductionFunc =
Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
ReductionFunc->setAttributes(FuncAttrs);
ReductionFunc->addParamAttr(0, Attribute::NoUndef);
@@ -3162,33 +3162,27 @@ Function *OpenMPIRBuilder::createReductionFunction(
BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
Builder.SetInsertPoint(EntryBB);
+ // Need to alloca memory here and deal with the pointers before getting
+ // LHS/RHS pointers out
Value *LHSArrayPtr = nullptr;
Value *RHSArrayPtr = nullptr;
- if (IsGPU) {
- // Need to alloca memory here and deal with the pointers before getting
- // LHS/RHS pointers out
- //
- Argument *Arg0 = ReductionFunc->getArg(0);
- Argument *Arg1 = ReductionFunc->getArg(1);
- Type *Arg0Type = Arg0->getType();
- Type *Arg1Type = Arg1->getType();
-
- Value *LHSAlloca =
- Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
- Value *RHSAlloca =
- Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
- Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
- Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
- RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
- Builder.CreateStore(Arg0, LHSAddrCast);
- Builder.CreateStore(Arg1, RHSAddrCast);
- LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
- RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
- } else {
- LHSArrayPtr = ReductionFunc->getArg(0);
- RHSArrayPtr = ReductionFunc->getArg(1);
- }
+ Argument *Arg0 = ReductionFunc->getArg(0);
+ Argument *Arg1 = ReductionFunc->getArg(1);
+ Type *Arg0Type = Arg0->getType();
+ Type *Arg1Type = Arg1->getType();
+
+ Value *LHSAlloca =
+ Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+ Value *RHSAlloca =
+ Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+ Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
+ Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
+ Builder.CreateStore(Arg0, LHSAddrCast);
+ Builder.CreateStore(Arg1, RHSAddrCast);
+ LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
+ RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
Type *IndexTy = Builder.getIndexTy(
@@ -3462,6 +3456,54 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
return Builder.saveIP();
}
+static Function *getFreshReductionFunc(Module &M) {
+ Type *VoidTy = Type::getVoidTy(M.getContext());
+ Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
+ auto *FuncTy =
+ FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
+ return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+ ".omp.reduction.func", &M);
+}
+
+static void populateReductionFunction(
+ Function *ReductionFunc,
+ ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+ IRBuilder<> &Builder) {
+ Module *Module = ReductionFunc->getParent();
+ BasicBlock *ReductionFuncBlock =
+ BasicBlock::Create(Module->getContext(), "", ReductionFunc);
+ Builder.SetInsertPoint(ReductionFuncBlock);
+ Value *LHSArrayPtr = nullptr;
+ Value *RHSArrayPtr = nullptr;
+ LHSArrayPtr = ReductionFunc->getArg(0);
+ RHSArrayPtr = ReductionFunc->getArg(1);
+
+ unsigned NumReductions = ReductionInfos.size();
+ Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
+
+ for (auto En : enumerate(ReductionInfos)) {
+ const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+ Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedArrayTy, LHSArrayPtr, 0, En.index());
+ Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+ Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LHSI8Ptr, RI.Variable->getType());
+ Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+ Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedArrayTy, RHSArrayPtr, 0, En.index());
+ Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
+ Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RHSI8Ptr, RI.PrivateVariable->getType());
+ Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+ Value *Reduced;
+ Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
+ if (!Builder.GetInsertBlock())
+ return;
+ Builder.CreateStore(Reduced, LHSPtr);
+ }
+ Builder.CreateRetVoid();
+}
+
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
const LocationDescription &Loc, InsertPointTy AllocaIP,
ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef,
@@ -3518,8 +3560,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
const DataLayout &DL = Module->getDataLayout();
unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
- Function *ReductionFunc = createReductionFunction(
- Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos);
+ Function *ReductionFunc = getFreshReductionFunc(M);
Value *Lock = getOMPCriticalRegionLock(".reduction");
Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
@@ -3595,6 +3636,10 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
Builder.CreateUnreachable();
}
+ // Populate the outlined reduction function using the elementwise reduction
+ // function. Partial values are extracted from the type-erased array of
+ // pointers to private variables.
+ populateReductionFunction(ReductionFunc, ReductionInfos, Builder);
Builder.SetInsertPoint(ContinuationBlock);
return Builder.saveIP();
}
>From 6228edd46016c301f9de14fa16b1100194a27db7 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 14 Feb 2024 16:11:26 +0000
Subject: [PATCH 10/18] Fix nit issue.
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index f8a8e70e4c3f4a..0ab34c30ba1c1d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3301,7 +3301,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
} else {
CodeGenIP = Builder.saveIP();
ReductionFunc = createReductionFunction(
- Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, true,
+ Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos,
ReductionGenCBTy, FuncAttrs);
Builder.restoreIP(CodeGenIP);
}
>From b3749a6f5772500ba29d560a882043c3cd18846d Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 14 Feb 2024 17:45:07 +0000
Subject: [PATCH 11/18] Fix OpenMPIRBuilderTest build failure.
---
llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 5cf1dbd0620f87..de4b6d6c226a81 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -4986,7 +4986,7 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) {
{XorType, XorReduced, XorPrivatized,
/*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
xorReduction, /*ReductionGenClang=*/nullptr, xorAtomicReduction}};
-
+ OMPBuilder.Config.setIsGPU(false);
OMPBuilder.createReductions(BodyIP, BodyAllocaIP, ReductionInfos);
Builder.restoreIP(AfterIP);
@@ -5234,6 +5234,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) {
/* NumThreads */ nullptr, OMP_PROC_BIND_default,
/* IsCancellable */ false);
+ OMPBuilder.Config.setIsGPU(false);
OMPBuilder.createReductions(
FirstBodyIP, FirstBodyAllocaIP,
{{SumType, SumReduced, SumPrivatized,
>From a6e73f81185bd46b43a642a4f13bc02c848d906e Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Thu, 15 Feb 2024 14:17:27 +0000
Subject: [PATCH 12/18] Fix Clang CI build error.
---
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 --
1 file changed, 2 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 93b46da1d55458..919cc048ea6d52 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1656,9 +1656,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
-#ifndef NDEBUG
bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
-#endif
ASTContext &C = CGM.getContext();
>From e32eb4fb43af5668ce824ca68543d5c689d191ad Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Tue, 12 Mar 2024 17:21:55 +0000
Subject: [PATCH 13/18] Add support for Complex variable types.
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 48 +++++++++++++++++++++--
1 file changed, 45 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 0ab34c30ba1c1d..f538296804b9e9 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2355,7 +2355,21 @@ void OpenMPIRBuilder::emitReductionListCopy(
break;
}
case EvaluationKindTy::Complex: {
- assert(false && "Complex data type not handled");
+ Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, SrcElementAddr, 0, 0, ".realp");
+ Value *SrcReal = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
+ Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
+ Value *SrcImg = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
+
+ Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, DestElementAddr, 0, 0, ".realp");
+ Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, DestElementAddr, 0, 1, ".imagp");
+ Builder.CreateStore(SrcReal, DestRealPtr);
+ Builder.CreateStore(SrcImg, DestImgPtr);
break;
}
case EvaluationKindTy::Aggregate: {
@@ -2861,7 +2875,21 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
break;
}
case EvaluationKindTy::Complex: {
- assert(false && "Complex data type not handled");
+ Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, ElemPtr, 0, 0, ".realp");
+ Value *SrcReal = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
+ Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, ElemPtr, 0, 1, ".imagp");
+ Value *SrcImg = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
+
+ Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, GlobVal, 0, 0, ".realp");
+ Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, GlobVal, 0, 1, ".imagp");
+ Builder.CreateStore(SrcReal, DestRealPtr);
+ Builder.CreateStore(SrcImg, DestImgPtr);
break;
}
case EvaluationKindTy::Aggregate: {
@@ -3037,7 +3065,21 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
break;
}
case EvaluationKindTy::Complex: {
- assert(false && "Complex data type not handled");
+ Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, GlobValPtr, 0, 0, ".realp");
+ Value *SrcReal = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
+ Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, GlobValPtr, 0, 1, ".imagp");
+ Value *SrcImg = Builder.CreateLoad(
+ RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
+
+ Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, ElemPtr, 0, 0, ".realp");
+ Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
+ RI.ElementType, ElemPtr, 0, 1, ".imagp");
+ Builder.CreateStore(SrcReal, DestRealPtr);
+ Builder.CreateStore(SrcImg, DestImgPtr);
break;
}
case EvaluationKindTy::Aggregate: {
>From 446284a38d6600385a5b8777a1a43693dc3a8788 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Tue, 19 Mar 2024 19:33:58 +0000
Subject: [PATCH 14/18] Add complex reduction test.
---
clang/test/OpenMP/reduction_complex.c | 1072 +++++++++++++++++++++++++
1 file changed, 1072 insertions(+)
create mode 100644 clang/test/OpenMP/reduction_complex.c
diff --git a/clang/test/OpenMP/reduction_complex.c b/clang/test/OpenMP/reduction_complex.c
new file mode 100644
index 00000000000000..cbc5b14087239d
--- /dev/null
+++ b/clang/test/OpenMP/reduction_complex.c
@@ -0,0 +1,1072 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
+// RUN: -triple powerpc64le-unknown-unknown \
+// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o \
+// RUN: %t-ppc-host.bc
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
+// RUN: -triple nvptx64-unknown-unknown -DCUDA \
+// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s \
+// RUN: -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc \
+// RUN: -o - | FileCheck %s --check-prefix CHECK
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ \
+// RUN: -triple powerpc64le-unknown-unknown -DDIAG \
+// RUN: -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm \
+// RUN: %s -o - | FileCheck %s \
+// RUN: --check-prefix=CHECK1
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ \
+// RUN: -triple i386-unknown-unknown \
+// RUN: -fopenmp-targets=i386-pc-linux-gnu -emit-llvm \
+// RUN: %s -o - | FileCheck %s \
+// RUN: --check-prefix=CHECK2
+
+// expected-no-diagnostics
+int foo() {
+ int i;
+ int j;
+ _Complex float sum = 0;
+
+#pragma omp target teams loop reduction(+:sum) collapse(2) bind(parallel) order(concurrent) lastprivate(j) map(tofrom:sum)
+
+ for(i=0; i<10; i++)
+ for(j=0; j<10; j++)
+ sum += i;
+
+ return 0;
+}
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31
+// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %[[VAL_0:.*]] = alloca ptr, align 8
+// CHECK-NEXT: %[[VAL_1:.*]] = alloca i64, align 8
+// CHECK-NEXT: %[[VAL_2:.*]] = alloca ptr, align 8
+// CHECK-NEXT: %[[VAL_3:.*]] = alloca i64, align 8
+// CHECK-NEXT: %[[VAL_4:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_5:.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr %[[VAL_6:.*]], ptr %[[VAL_0]], align 8
+// CHECK-NEXT: store i64 %[[VAL_7:.*]], ptr %[[VAL_1]], align 8
+// CHECK-NEXT: store ptr %[[VAL_8:.*]], ptr %[[VAL_2]], align 8
+// CHECK-NEXT: %[[VAL_9:.*]] = load ptr, ptr %[[VAL_2]], align 8
+// CHECK-NEXT: %[[VAL_10:.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31_kernel_environment, ptr %[[VAL_6]])
+// CHECK-NEXT: %[[VAL_11:.*]] = icmp eq i32 %[[VAL_10]], -1
+// CHECK-NEXT: br i1 %[[VAL_11]], label %[[VAL_12:.*]], label %[[VAL_13:.*]]
+// CHECK: user_code.entry: ; preds = %[[VAL_14:.*]]
+// CHECK-NEXT: %[[VAL_15:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK-NEXT: %[[VAL_16:.*]] = load i32, ptr %[[VAL_1]], align 4
+// CHECK-NEXT: store i32 %[[VAL_16]], ptr %[[VAL_3]], align 4
+// CHECK-NEXT: %[[VAL_17:.*]] = load i64, ptr %[[VAL_3]], align 8
+// CHECK-NEXT: store i32 0, ptr %[[VAL_4]], align 4
+// CHECK-NEXT: store i32 %[[VAL_15]], ptr %[[VAL_5]], align 4
+// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31_omp_outlined(ptr %[[VAL_5]], ptr %[[VAL_4]], i64 %[[VAL_17]], ptr %[[VAL_9]]) #2
+// CHECK-NEXT: call void @__kmpc_target_deinit()
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31_omp_outlined
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %[[VAL_18:.*]] = alloca ptr, align 8
+// CHECK-NEXT: %[[VAL_19:.*]] = alloca ptr, align 8
+// CHECK-NEXT: %[[VAL_20:.*]] = alloca i64, align 8
+// CHECK-NEXT: %[[VAL_21:.*]] = alloca ptr, align 8
+// CHECK-NEXT: %[[VAL_22:.*]] = alloca { float, float }, align 4
+// CHECK-NEXT: %[[VAL_23:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_24:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_25:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_26:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_27:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_28:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_29:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_30:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_31:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_32:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_33:.*]] = alloca i64, align 8
+// CHECK-NEXT: %[[VAL_34:.*]] = alloca [4 x ptr], align 8
+// CHECK-NEXT: %[[VAL_35:.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: store ptr %[[VAL_36:.*]], ptr %[[VAL_18]], align 8
+// CHECK-NEXT: store ptr %[[VAL_37:.*]], ptr %[[VAL_19]], align 8
+// CHECK-NEXT: store i64 %[[VAL_38:.*]], ptr %[[VAL_20]], align 8
+// CHECK-NEXT: store ptr %[[VAL_39:.*]], ptr %[[VAL_21]], align 8
+// CHECK-NEXT: %[[VAL_40:.*]] = load ptr, ptr %[[VAL_21]], align 8
+// CHECK-NEXT: %[[VAL_41:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_22]], i32 0, i32 0
+// CHECK-NEXT: %[[VAL_42:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_22]], i32 0, i32 1
+// CHECK-NEXT: store float 0.000000e+00, ptr %[[VAL_41]], align 4
+// CHECK-NEXT: store float 0.000000e+00, ptr %[[VAL_42]], align 4
+// CHECK-NEXT: store i32 0, ptr %[[VAL_26]], align 4
+// CHECK-NEXT: store i32 99, ptr %[[VAL_27]], align 4
+// CHECK-NEXT: store i32 1, ptr %[[VAL_28]], align 4
+// CHECK-NEXT: store i32 0, ptr %[[VAL_29]], align 4
+// CHECK-NEXT: %[[VAL_43:.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-NEXT: %[[VAL_44:.*]] = load ptr, ptr %[[VAL_18]], align 8
+// CHECK-NEXT: %[[VAL_45:.*]] = load i32, ptr %[[VAL_44]], align 4
+// CHECK-NEXT: call void @__kmpc_distribute_static_init_4(ptr @2, i32 %[[VAL_45]], i32 91, ptr %[[VAL_29]], ptr %[[VAL_26]], ptr %[[VAL_27]], ptr %[[VAL_28]], i32 1, i32 %[[VAL_43]])
+// CHECK-NEXT: %[[VAL_46:.*]] = load i32, ptr %[[VAL_27]], align 4
+// CHECK-NEXT: %[[VAL_47:.*]] = icmp sgt i32 %[[VAL_46]], 99
+// CHECK-NEXT: br i1 %[[VAL_47]], label %[[VAL_48:.*]], label %[[VAL_49:.*]]
+// CHECK: cond.true: ; preds = %[[VAL_50:.*]]
+// CHECK-NEXT: br label %[[VAL_51:.*]]
+// CHECK: cond.false: ; preds = %[[VAL_50]]
+// CHECK-NEXT: %[[VAL_52:.*]] = load i32, ptr %[[VAL_27]], align 4
+// CHECK-NEXT: br label %[[VAL_51]]
+// CHECK: cond.end: ; preds = %[[VAL_49]], %[[VAL_48]]
+// CHECK-NEXT: %[[VAL_53:.*]] = phi i32 [ 99, %[[VAL_48]] ], [ %[[VAL_52]], %[[VAL_49]] ]
+// CHECK-NEXT: store i32 %[[VAL_53]], ptr %[[VAL_27]], align 4
+// CHECK-NEXT: %[[VAL_54:.*]] = load i32, ptr %[[VAL_26]], align 4
+// CHECK-NEXT: store i32 %[[VAL_54]], ptr %[[VAL_23]], align 4
+// CHECK-NEXT: br label %[[VAL_55:.*]]
+// CHECK: omp.inner.for.cond: ; preds = %[[VAL_56:.*]], %[[VAL_51]]
+// CHECK-NEXT: %[[VAL_57:.*]] = load i32, ptr %[[VAL_23]], align 4
+// CHECK-NEXT: %[[VAL_58:.*]] = icmp slt i32 %[[VAL_57]], 100
+// CHECK-NEXT: br i1 %[[VAL_58]], label %[[VAL_59:.*]], label %[[VAL_60:.*]]
+// CHECK: omp.inner.for.body: ; preds = %[[VAL_55]]
+// CHECK-NEXT: %[[VAL_61:.*]] = load i32, ptr %[[VAL_26]], align 4
+// CHECK-NEXT: %[[VAL_62:.*]] = zext i32 %[[VAL_61]] to i64
+// CHECK-NEXT: %[[VAL_63:.*]] = load i32, ptr %[[VAL_27]], align 4
+// CHECK-NEXT: %[[VAL_64:.*]] = zext i32 %[[VAL_63]] to i64
+// CHECK-NEXT: %[[VAL_65:.*]] = load i32, ptr %[[VAL_30]], align 4
+// CHECK-NEXT: store i32 %[[VAL_65]], ptr %[[VAL_33]], align 4
+// CHECK-NEXT: %[[VAL_66:.*]] = load i64, ptr %[[VAL_33]], align 8
+// CHECK-NEXT: %[[VAL_67:.*]] = getelementptr inbounds [4 x ptr], ptr %[[VAL_34]], i64 0, i64 0
+// CHECK-NEXT: %[[VAL_68:.*]] = inttoptr i64 %[[VAL_62]] to ptr
+// CHECK-NEXT: store ptr %[[VAL_68]], ptr %[[VAL_67]], align 8
+// CHECK-NEXT: %[[VAL_69:.*]] = getelementptr inbounds [4 x ptr], ptr %[[VAL_34]], i64 0, i64 1
+// CHECK-NEXT: %[[VAL_70:.*]] = inttoptr i64 %[[VAL_64]] to ptr
+// CHECK-NEXT: store ptr %[[VAL_70]], ptr %[[VAL_69]], align 8
+// CHECK-NEXT: %[[VAL_71:.*]] = getelementptr inbounds [4 x ptr], ptr %[[VAL_34]], i64 0, i64 2
+// CHECK-NEXT: %[[VAL_72:.*]] = inttoptr i64 %[[VAL_66]] to ptr
+// CHECK-NEXT: store ptr %[[VAL_72]], ptr %[[VAL_71]], align 8
+// CHECK-NEXT: %[[VAL_73:.*]] = getelementptr inbounds [4 x ptr], ptr %[[VAL_34]], i64 0, i64 3
+// CHECK-NEXT: store ptr %[[VAL_22]], ptr %[[VAL_73]], align 8
+// CHECK-NEXT: call void @__kmpc_parallel_51(ptr @1, i32 %[[VAL_45]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31_omp_outlined_omp_outlined, ptr null, ptr %[[VAL_34]], i64 4)
+// CHECK-NEXT: br label %[[VAL_74:.*]]
+// CHECK: omp.inner.for.inc: ; preds = %[[VAL_59]]
+// CHECK-NEXT: %[[VAL_75:.*]] = load i32, ptr %[[VAL_23]], align 4
+// CHECK-NEXT: %[[VAL_76:.*]] = load i32, ptr %[[VAL_28]], align 4
+// CHECK-NEXT: %[[VAL_77:.*]] = add nsw i32 %[[VAL_75]], %[[VAL_76]]
+// CHECK-NEXT: store i32 %[[VAL_77]], ptr %[[VAL_23]], align 4
+// CHECK-NEXT: %[[VAL_78:.*]] = load i32, ptr %[[VAL_26]], align 4
+// CHECK-NEXT: %[[VAL_79:.*]] = load i32, ptr %[[VAL_28]], align 4
+// CHECK-NEXT: %[[VAL_80:.*]] = add nsw i32 %[[VAL_78]], %[[VAL_79]]
+// CHECK-NEXT: store i32 %[[VAL_80]], ptr %[[VAL_26]], align 4
+// CHECK-NEXT: %[[VAL_81:.*]] = load i32, ptr %[[VAL_27]], align 4
+// CHECK-NEXT: %[[VAL_82:.*]] = load i32, ptr %[[VAL_28]], align 4
+// CHECK-NEXT: %[[VAL_83:.*]] = add nsw i32 %[[VAL_81]], %[[VAL_82]]
+// CHECK-NEXT: store i32 %[[VAL_83]], ptr %[[VAL_27]], align 4
+// CHECK-NEXT: %[[VAL_84:.*]] = load i32, ptr %[[VAL_27]], align 4
+// CHECK-NEXT: %[[VAL_85:.*]] = icmp sgt i32 %[[VAL_84]], 99
+// CHECK-NEXT: br i1 %[[VAL_85]], label %[[VAL_86:.*]], label %[[VAL_87:.*]]
+// CHECK: cond.true9: ; preds = %[[VAL_74]]
+// CHECK-NEXT: br label %[[VAL_56]]
+// CHECK: cond.false10: ; preds = %[[VAL_74]]
+// CHECK-NEXT: %[[VAL_88:.*]] = load i32, ptr %[[VAL_27]], align 4
+// CHECK-NEXT: br label %[[VAL_56]]
+// CHECK: cond.end11: ; preds = %[[VAL_87]], %[[VAL_86]]
+// CHECK-NEXT: %[[VAL_89:.*]] = phi i32 [ 99, %[[VAL_86]] ], [ %[[VAL_88]], %[[VAL_87]] ]
+// CHECK-NEXT: store i32 %[[VAL_89]], ptr %[[VAL_27]], align 4
+// CHECK-NEXT: %[[VAL_90:.*]] = load i32, ptr %[[VAL_26]], align 4
+// CHECK-NEXT: store i32 %[[VAL_90]], ptr %[[VAL_23]], align 4
+// CHECK-NEXT: br label %[[VAL_55]]
+// CHECK: omp.inner.for.end: ; preds = %[[VAL_55]]
+// CHECK-NEXT: br label %[[VAL_91:.*]]
+// CHECK: omp.loop.exit: ; preds = %[[VAL_60]]
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @3, i32 %[[VAL_45]])
+// CHECK-NEXT: %[[VAL_92:.*]] = load i32, ptr %[[VAL_29]], align 4
+// CHECK-NEXT: %[[VAL_93:.*]] = icmp ne i32 %[[VAL_92]], 0
+// CHECK-NEXT: br i1 %[[VAL_93]], label %[[VAL_94:.*]], label %[[VAL_95:.*]]
+// CHECK: .omp.lastprivate.then: ; preds = %[[VAL_91]]
+// CHECK-NEXT: store i32 10, ptr %[[VAL_30]], align 4
+// CHECK-NEXT: %[[VAL_96:.*]] = load i32, ptr %[[VAL_30]], align 4
+// CHECK-NEXT: store i32 %[[VAL_96]], ptr %[[VAL_20]], align 4
+// CHECK-NEXT: br label %[[VAL_95]]
+// CHECK: .omp.lastprivate.done: ; preds = %[[VAL_94]], %[[VAL_91]]
+// CHECK-NEXT: %[[VAL_97:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_35]], i64 0, i64 0
+// CHECK-NEXT: store ptr %[[VAL_22]], ptr %[[VAL_97]], align 8
+// CHECK-NEXT: %[[VAL_98:.*]]"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK-NEXT: %[[VAL_99:.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @1, ptr %[[VAL_98]]"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, i64 8, ptr %[[VAL_35]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
+// CHECK-NEXT: %[[VAL_100:.*]] = icmp eq i32 %[[VAL_99]], 1
+// CHECK-NEXT: br i1 %[[VAL_100]], label %[[VAL_101:.*]], label %[[VAL_102:.*]]
+// CHECK: .omp.reduction.then: ; preds = %[[VAL_95]]
+// CHECK-NEXT: %[[VAL_103:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_40]], i32 0, i32 0
+// CHECK-NEXT: %[[VAL_104:.*]] = load float, ptr %[[VAL_103]], align 4
+// CHECK-NEXT: %[[VAL_105:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_40]], i32 0, i32 1
+// CHECK-NEXT: %[[VAL_106:.*]] = load float, ptr %[[VAL_105]], align 4
+// CHECK-NEXT: %[[VAL_107:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_22]], i32 0, i32 0
+// CHECK-NEXT: %[[VAL_108:.*]] = load float, ptr %[[VAL_107]], align 4
+// CHECK-NEXT: %[[VAL_109:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_22]], i32 0, i32 1
+// CHECK-NEXT: %[[VAL_110:.*]] = load float, ptr %[[VAL_109]], align 4
+// CHECK-NEXT: %[[VAL_111:.*]] = fadd float %[[VAL_104]], %[[VAL_108]]
+// CHECK-NEXT: %[[VAL_112:.*]] = fadd float %[[VAL_106]], %[[VAL_110]]
+// CHECK-NEXT: %[[VAL_113:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_40]], i32 0, i32 0
+// CHECK-NEXT: %[[VAL_114:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_40]], i32 0, i32 1
+// CHECK-NEXT: store float %[[VAL_111]], ptr %[[VAL_113]], align 4
+// CHECK-NEXT: store float %[[VAL_112]], ptr %[[VAL_114]], align 4
+// CHECK-NEXT: br label %[[VAL_102]]
+// CHECK: .omp.reduction.done: ; preds = %[[VAL_101]], %[[VAL_95]]
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %[[VAL_228:.*]] = alloca ptr, align 8
+// CHECK-NEXT: %[[VAL_229:.*]] = alloca i16, align 2
+// CHECK-NEXT: %[[VAL_230:.*]] = alloca i16, align 2
+// CHECK-NEXT: %[[VAL_231:.*]] = alloca i16, align 2
+// CHECK-NEXT: %[[VAL_232:.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: %[[VAL_233:.*]] = alloca { float, float }, align 8
+// CHECK-NEXT: store ptr %[[VAL_234:.*]], ptr %[[VAL_228]], align 8
+// CHECK-NEXT: store i16 %[[VAL_235:.*]], ptr %[[VAL_229]], align 2
+// CHECK-NEXT: store i16 %[[VAL_236:.*]], ptr %[[VAL_230]], align 2
+// CHECK-NEXT: store i16 %[[VAL_237:.*]], ptr %[[VAL_231]], align 2
+// CHECK-NEXT: %[[VAL_238:.*]] = load ptr, ptr %[[VAL_228]], align 8
+// CHECK-NEXT: %[[VAL_239:.*]] = load i16, ptr %[[VAL_229]], align 2
+// CHECK-NEXT: %[[VAL_240:.*]] = load i16, ptr %[[VAL_230]], align 2
+// CHECK-NEXT: %[[VAL_241:.*]] = load i16, ptr %[[VAL_231]], align 2
+// CHECK-NEXT: %[[VAL_242:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_238]], i64 0, i64 0
+// CHECK-NEXT: %[[VAL_243:.*]] = load ptr, ptr %[[VAL_242]], align 8
+// CHECK-NEXT: %[[VAL_244:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_232]], i64 0, i64 0
+// CHECK-NEXT: %[[VAL_245:.*]] = getelementptr { float, float }, ptr %[[VAL_243]], i64 1
+// CHECK-NEXT: %[[VAL_246:.*]] = load i64, ptr %[[VAL_243]], align 8
+// CHECK-NEXT: %[[VAL_247:.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-NEXT: %[[VAL_248:.*]] = trunc i32 %[[VAL_247]] to i16
+// CHECK-NEXT: %[[VAL_249:.*]] = call i64 @__kmpc_shuffle_int64(i64 %[[VAL_246]], i16 %[[VAL_240]], i16 %[[VAL_248]])
+// CHECK-NEXT: store i64 %[[VAL_249]], ptr %[[VAL_233]], align 8
+// CHECK-NEXT: %[[VAL_250:.*]] = getelementptr i64, ptr %[[VAL_243]], i64 1
+// CHECK-NEXT: %[[VAL_251:.*]] = getelementptr i64, ptr %[[VAL_233]], i64 1
+// CHECK-NEXT: store ptr %[[VAL_233]], ptr %[[VAL_244]], align 8
+// CHECK-NEXT: %[[VAL_252:.*]] = icmp eq i16 %[[VAL_241]], 0
+// CHECK-NEXT: %[[VAL_253:.*]] = icmp eq i16 %[[VAL_241]], 1
+// CHECK-NEXT: %[[VAL_254:.*]] = icmp ult i16 %[[VAL_239]], %[[VAL_240]]
+// CHECK-NEXT: %[[VAL_255:.*]] = and i1 %[[VAL_253]], %[[VAL_254]]
+// CHECK-NEXT: %[[VAL_256:.*]] = icmp eq i16 %[[VAL_241]], 2
+// CHECK-NEXT: %[[VAL_257:.*]] = and i16 %[[VAL_239]], 1
+// CHECK-NEXT: %[[VAL_258:.*]] = icmp eq i16 %[[VAL_257]], 0
+// CHECK-NEXT: %[[VAL_259:.*]] = and i1 %[[VAL_256]], %[[VAL_258]]
+// CHECK-NEXT: %[[VAL_260:.*]] = icmp sgt i16 %[[VAL_240]], 0
+// CHECK-NEXT: %[[VAL_261:.*]] = and i1 %[[VAL_259]], %[[VAL_260]]
+// CHECK-NEXT: %[[VAL_262:.*]] = or i1 %[[VAL_252]], %[[VAL_255]]
+// CHECK-NEXT: %[[VAL_263:.*]] = or i1 %[[VAL_262]], %[[VAL_261]]
+// CHECK-NEXT: br i1 %[[VAL_263]], label %[[VAL_264:.*]], label %[[VAL_265:.*]]
+// CHECK: then: ; preds = %[[VAL_266:.*]]
+// CHECK-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr %[[VAL_238]], ptr %[[VAL_232]]) #2
+// CHECK-NEXT: br label %[[VAL_267:.*]]
+// CHECK: else: ; preds = %[[VAL_266]]
+// CHECK-NEXT: br label %[[VAL_267]]
+// CHECK: ifcont: ; preds = %[[VAL_265]], %[[VAL_264]]
+// CHECK-NEXT: %[[VAL_268:.*]] = icmp eq i16 %[[VAL_241]], 1
+// CHECK-NEXT: %[[VAL_269:.*]] = icmp uge i16 %[[VAL_239]], %[[VAL_240]]
+// CHECK-NEXT: %[[VAL_270:.*]] = and i1 %[[VAL_268]], %[[VAL_269]]
+// CHECK-NEXT: br i1 %[[VAL_270]], label %[[VAL_271:.*]], label %[[VAL_272:.*]]
+// CHECK: then4: ; preds = %[[VAL_267]]
+// CHECK-NEXT: %[[VAL_273:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_232]], i64 0, i64 0
+// CHECK-NEXT: %[[VAL_274:.*]] = load ptr, ptr %[[VAL_273]], align 8
+// CHECK-NEXT: %[[VAL_275:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_238]], i64 0, i64 0
+// CHECK-NEXT: %[[VAL_276:.*]] = load ptr, ptr %[[VAL_275]], align 8
+// CHECK-NEXT: %[[VAL_277:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_274]], i32 0, i32 0
+// CHECK-NEXT: %[[VAL_278:.*]] = load float, ptr %[[VAL_277]], align 4
+// CHECK-NEXT: %[[VAL_279:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_274]], i32 0, i32 1
+// CHECK-NEXT: %[[VAL_280:.*]] = load float, ptr %[[VAL_279]], align 4
+// CHECK-NEXT: %[[VAL_281:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_276]], i32 0, i32 0
+// CHECK-NEXT: %[[VAL_282:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_276]], i32 0, i32 1
+// CHECK-NEXT: store float %[[VAL_278]], ptr %[[VAL_281]], align 4
+// CHECK-NEXT: store float %[[VAL_280]], ptr %[[VAL_282]], align 4
+// CHECK-NEXT: br label %[[VAL_283:.*]]
+// CHECK: else7: ; preds = %[[VAL_267]]
+// CHECK-NEXT: br label %[[VAL_283]]
+// CHECK: ifcont8: ; preds = %[[VAL_272]], %[[VAL_271]]
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %[[VAL_400:.*]] = alloca ptr, align 8
+// CHECK-NEXT: %[[VAL_401:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_402:.*]] = alloca i32, align 4
+// CHECK-NEXT: %[[VAL_403:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK-NEXT: store ptr %[[VAL_404:.*]], ptr %[[VAL_400]], align 8
+// CHECK-NEXT: store i32 %[[VAL_405:.*]], ptr %[[VAL_401]], align 4
+// CHECK-NEXT: %[[VAL_406:.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-NEXT: %[[VAL_407:.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-NEXT: %[[VAL_408:.*]] = and i32 %[[VAL_407]], 31
+// CHECK-NEXT: %[[VAL_409:.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-NEXT: %[[VAL_410:.*]] = ashr i32 %[[VAL_409]], 5
+// CHECK-NEXT: %[[VAL_411:.*]] = load ptr, ptr %[[VAL_400]], align 8
+// CHECK-NEXT: store i32 0, ptr %[[VAL_402]], align 4
+// CHECK-NEXT: br label %[[VAL_412:.*]]
+// CHECK: precond: ; preds = %[[VAL_413:.*]], %[[VAL_414:.*]]
+// CHECK-NEXT: %[[VAL_415:.*]] = load i32, ptr %[[VAL_402]], align 4
+// CHECK-NEXT: %[[VAL_416:.*]] = icmp ult i32 %[[VAL_415]], 2
+// CHECK-NEXT: br i1 %[[VAL_416]], label %[[VAL_417:.*]], label %[[VAL_418:.*]]
+// CHECK: body: ; preds = %[[VAL_412]]
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @4, i32 %[[VAL_403]])
+// CHECK-NEXT: %[[VAL_419:.*]] = icmp eq i32 %[[VAL_408]], 0
+// CHECK-NEXT: br i1 %[[VAL_419]], label %[[VAL_420:.*]], label %[[VAL_421:.*]]
+// CHECK: then: ; preds = %[[VAL_417]]
+// CHECK-NEXT: %[[VAL_422:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_411]], i64 0, i64 0
+// CHECK-NEXT: %[[VAL_423:.*]] = load ptr, ptr %[[VAL_422]], align 8
+// CHECK-NEXT: %[[VAL_424:.*]] = getelementptr i32, ptr %[[VAL_423]], i32 %[[VAL_415]]
+// CHECK-NEXT: %[[VAL_425:.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 %[[VAL_410]]
+// CHECK-NEXT: %[[VAL_426:.*]] = load i32, ptr %[[VAL_424]], align 4
+// CHECK-NEXT: store volatile i32 %[[VAL_426]], ptr addrspace(3) %[[VAL_425]], align 4
+// CHECK-NEXT: br label %[[VAL_427:.*]]
+// CHECK: else: ; preds = %[[VAL_417]]
+// CHECK-NEXT: br label %[[VAL_427]]
+// CHECK: ifcont: ; preds = %[[VAL_421]], %[[VAL_420]]
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @4, i32 %[[VAL_403]])
+// CHECK-NEXT: %[[VAL_428:.*]] = load i32, ptr %[[VAL_401]], align 4
+// CHECK-NEXT: %[[VAL_429:.*]] = icmp ult i32 %[[VAL_406]], %[[VAL_428]]
+// CHECK-NEXT: br i1 %[[VAL_429]], label %[[VAL_430:.*]], label %[[VAL_431:.*]]
+// CHECK: then2: ; preds = %[[VAL_427]]
+// CHECK-NEXT: %[[VAL_432:.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 %[[VAL_406]]
+// CHECK-NEXT: %[[VAL_433:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_411]], i64 0, i64 0
+// CHECK-NEXT: %[[VAL_434:.*]] = load ptr, ptr %[[VAL_433]], align 8
+// CHECK-NEXT: %[[VAL_435:.*]] = getelementptr i32, ptr %[[VAL_434]], i32 %[[VAL_415]]
+// CHECK-NEXT: %[[VAL_436:.*]] = load volatile i32, ptr addrspace(3) %[[VAL_432]], align 4
+// CHECK-NEXT: store i32 %[[VAL_436]], ptr %[[VAL_435]], align 4
+// CHECK-NEXT: br label %[[VAL_413]]
+// CHECK: else3: ; preds = %[[VAL_427]]
+// CHECK-NEXT: br label %[[VAL_413]]
+// CHECK: ifcont4: ; preds = %[[VAL_431]], %[[VAL_430]]
+// CHECK-NEXT: %[[VAL_437:.*]] = add nsw i32 %[[VAL_415]], 1
+// CHECK-NEXT: store i32 %[[VAL_437]], ptr %[[VAL_402]], align 4
+// CHECK-NEXT: br label %[[VAL_412]]
+// CHECK: exit: ; preds = %[[VAL_412]]
+// CHECK-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_Z3foov
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: %[[VAL_0:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_1:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_2:.*]] = alloca { float, float }, align 4
+// CHECK1-NEXT: %[[VAL_3:.*]] = alloca i64, align 8
+// CHECK1-NEXT: %[[VAL_4:.*]] = alloca [2 x ptr], align 8
+// CHECK1-NEXT: %[[VAL_5:.*]] = alloca [2 x ptr], align 8
+// CHECK1-NEXT: %[[VAL_6:.*]] = alloca [2 x ptr], align 8
+// CHECK1-NEXT: %[[VAL_7:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_8:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_9:.*]] = alloca %[[VAL_10:.*]], align 8
+// CHECK1-NEXT: %[[VAL_11:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_2]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_12:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_2]], i32 0, i32 1
+// CHECK1-NEXT: store float 0.000000e+00, ptr %[[VAL_11]], align 4
+// CHECK1-NEXT: store float 0.000000e+00, ptr %[[VAL_12]], align 4
+// CHECK1-NEXT: %[[VAL_13:.*]] = load i32, ptr %[[VAL_1]], align 4
+// CHECK1-NEXT: store i32 %[[VAL_13]], ptr %[[VAL_3]], align 4
+// CHECK1-NEXT: %[[VAL_14:.*]] = load i64, ptr %[[VAL_3]], align 8
+// CHECK1-NEXT: %[[VAL_15:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 0
+// CHECK1-NEXT: store i64 %[[VAL_14]], ptr %[[VAL_15]], align 8
+// CHECK1-NEXT: %[[VAL_16:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 0
+// CHECK1-NEXT: store i64 %[[VAL_14]], ptr %[[VAL_16]], align 8
+// CHECK1-NEXT: %[[VAL_17:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_6]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr %[[VAL_17]], align 8
+// CHECK1-NEXT: %[[VAL_18:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 1
+// CHECK1-NEXT: store ptr %[[VAL_2]], ptr %[[VAL_18]], align 8
+// CHECK1-NEXT: %[[VAL_19:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 1
+// CHECK1-NEXT: store ptr %[[VAL_2]], ptr %[[VAL_19]], align 8
+// CHECK1-NEXT: %[[VAL_20:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_6]], i64 0, i64 1
+// CHECK1-NEXT: store ptr null, ptr %[[VAL_20]], align 8
+// CHECK1-NEXT: %[[VAL_21:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_22:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_23:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr %[[VAL_23]], align 4
+// CHECK1-NEXT: %[[VAL_24:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 1
+// CHECK1-NEXT: store i32 2, ptr %[[VAL_24]], align 4
+// CHECK1-NEXT: %[[VAL_25:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 2
+// CHECK1-NEXT: store ptr %[[VAL_21]], ptr %[[VAL_25]], align 8
+// CHECK1-NEXT: %[[VAL_26:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 3
+// CHECK1-NEXT: store ptr %[[VAL_22]], ptr %[[VAL_26]], align 8
+// CHECK1-NEXT: %[[VAL_27:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes, ptr %[[VAL_27]], align 8
+// CHECK1-NEXT: %[[VAL_28:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes, ptr %[[VAL_28]], align 8
+// CHECK1-NEXT: %[[VAL_29:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr %[[VAL_29]], align 8
+// CHECK1-NEXT: %[[VAL_30:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr %[[VAL_30]], align 8
+// CHECK1-NEXT: %[[VAL_31:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 8
+// CHECK1-NEXT: store i64 100, ptr %[[VAL_31]], align 8
+// CHECK1-NEXT: %[[VAL_32:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr %[[VAL_32]], align 8
+// CHECK1-NEXT: %[[VAL_33:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr %[[VAL_33]], align 4
+// CHECK1-NEXT: %[[VAL_34:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr %[[VAL_34]], align 4
+// CHECK1-NEXT: %[[VAL_35:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr %[[VAL_35]], align 4
+// CHECK1-NEXT: %[[VAL_36:.*]] = call i32 @__tgt_target_kernel(ptr @4, i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.region_id, ptr %[[VAL_9]])
+// CHECK1-NEXT: %[[VAL_37:.*]] = icmp ne i32 %[[VAL_36]], 0
+// CHECK1-NEXT: br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_39:.*]]
+// CHECK1: omp_offload.failed: ; preds = %[[VAL_40:.*]]
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31(i64 %[[VAL_14]], ptr %[[VAL_2]]) #2
+// CHECK1-NEXT: br label %[[VAL_39]]
+// CHECK1: omp_offload.cont: ; preds = %[[VAL_38]], %[[VAL_40]]
+// CHECK1-NEXT: ret i32 0
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31
+// CHECK1-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: %[[VAL_41:.*]] = alloca i64, align 8
+// CHECK1-NEXT: %[[VAL_42:.*]] = alloca ptr, align 8
+// CHECK1-NEXT: %[[VAL_43:.*]] = alloca i64, align 8
+// CHECK1-NEXT: store i64 %[[VAL_44:.*]], ptr %[[VAL_41]], align 8
+// CHECK1-NEXT: store ptr %[[VAL_45:.*]], ptr %[[VAL_42]], align 8
+// CHECK1-NEXT: %[[VAL_46:.*]] = load ptr, ptr %[[VAL_42]], align 8
+// CHECK1-NEXT: %[[VAL_47:.*]] = load i32, ptr %[[VAL_41]], align 4
+// CHECK1-NEXT: store i32 %[[VAL_47]], ptr %[[VAL_43]], align 4
+// CHECK1-NEXT: %[[VAL_48:.*]] = load i64, ptr %[[VAL_43]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @4, i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined, i64 %[[VAL_48]], ptr %[[VAL_46]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: %[[VAL_49:.*]] = alloca ptr, align 8
+// CHECK1-NEXT: %[[VAL_50:.*]] = alloca ptr, align 8
+// CHECK1-NEXT: %[[VAL_51:.*]] = alloca i64, align 8
+// CHECK1-NEXT: %[[VAL_52:.*]] = alloca ptr, align 8
+// CHECK1-NEXT: %[[VAL_53:.*]] = alloca { float, float }, align 4
+// CHECK1-NEXT: %[[VAL_54:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_55:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_56:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_57:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_58:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_59:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_60:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_61:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_62:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_63:.*]] = alloca i32, align 4
+// CHECK1-NEXT: %[[VAL_64:.*]] = alloca i64, align 8
+// CHECK1-NEXT: %[[VAL_65:.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: %[[VAL_66:.*]] = alloca { float, float }, align 4
+// CHECK1-NEXT: %[[VAL_67:.*]] = alloca { float, float }, align 4
+// CHECK1-NEXT: %[[VAL_68:.*]] = alloca { float, float }, align 4
+// CHECK1-NEXT: store ptr %[[VAL_69:.*]], ptr %[[VAL_49]], align 8
+// CHECK1-NEXT: store ptr %[[VAL_70:.*]], ptr %[[VAL_50]], align 8
+// CHECK1-NEXT: store i64 %[[VAL_71:.*]], ptr %[[VAL_51]], align 8
+// CHECK1-NEXT: store ptr %[[VAL_72:.*]], ptr %[[VAL_52]], align 8
+// CHECK1-NEXT: %[[VAL_73:.*]] = load ptr, ptr %[[VAL_52]], align 8
+// CHECK1-NEXT: %[[VAL_74:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_75:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
+// CHECK1-NEXT: store float 0.000000e+00, ptr %[[VAL_74]], align 4
+// CHECK1-NEXT: store float 0.000000e+00, ptr %[[VAL_75]], align 4
+// CHECK1-NEXT: store i32 0, ptr %[[VAL_57]], align 4
+// CHECK1-NEXT: store i32 99, ptr %[[VAL_58]], align 4
+// CHECK1-NEXT: store i32 1, ptr %[[VAL_59]], align 4
+// CHECK1-NEXT: store i32 0, ptr %[[VAL_60]], align 4
+// CHECK1-NEXT: %[[VAL_76:.*]] = load ptr, ptr %[[VAL_49]], align 8
+// CHECK1-NEXT: %[[VAL_77:.*]] = load i32, ptr %[[VAL_76]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @1, i32 %[[VAL_77]], i32 92, ptr %[[VAL_60]], ptr %[[VAL_57]], ptr %[[VAL_58]], ptr %[[VAL_59]], i32 1, i32 1)
+// CHECK1-NEXT: %[[VAL_78:.*]] = load i32, ptr %[[VAL_58]], align 4
+// CHECK1-NEXT: %[[VAL_79:.*]] = icmp sgt i32 %[[VAL_78]], 99
+// CHECK1-NEXT: br i1 %[[VAL_79]], label %[[VAL_80:.*]], label %[[VAL_81:.*]]
+// CHECK1: cond.true: ; preds = %[[VAL_82:.*]]
+// CHECK1-NEXT: br label %[[VAL_83:.*]]
+// CHECK1: cond.false: ; preds = %[[VAL_82]]
+// CHECK1-NEXT: %[[VAL_84:.*]] = load i32, ptr %[[VAL_58]], align 4
+// CHECK1-NEXT: br label %[[VAL_83]]
+// CHECK1: cond.end: ; preds = %[[VAL_81]], %[[VAL_80]]
+// CHECK1-NEXT: %[[VAL_85:.*]] = phi i32 [ 99, %[[VAL_80]] ], [ %[[VAL_84]], %[[VAL_81]] ]
+// CHECK1-NEXT: store i32 %[[VAL_85]], ptr %[[VAL_58]], align 4
+// CHECK1-NEXT: %[[VAL_86:.*]] = load i32, ptr %[[VAL_57]], align 4
+// CHECK1-NEXT: store i32 %[[VAL_86]], ptr %[[VAL_54]], align 4
+// CHECK1-NEXT: br label %[[VAL_87:.*]]
+// CHECK1: omp.inner.for.cond: ; preds = %[[VAL_88:.*]], %[[VAL_83]]
+// CHECK1-NEXT: %[[VAL_89:.*]] = load i32, ptr %[[VAL_54]], align 4
+// CHECK1-NEXT: %[[VAL_90:.*]] = load i32, ptr %[[VAL_58]], align 4
+// CHECK1-NEXT: %[[VAL_91:.*]] = icmp sle i32 %[[VAL_89]], %[[VAL_90]]
+// CHECK1-NEXT: br i1 %[[VAL_91]], label %[[VAL_92:.*]], label %[[VAL_93:.*]]
+// CHECK1: omp.inner.for.body: ; preds = %[[VAL_87]]
+// CHECK1-NEXT: %[[VAL_94:.*]] = load i32, ptr %[[VAL_57]], align 4
+// CHECK1-NEXT: %[[VAL_95:.*]] = zext i32 %[[VAL_94]] to i64
+// CHECK1-NEXT: %[[VAL_96:.*]] = load i32, ptr %[[VAL_58]], align 4
+// CHECK1-NEXT: %[[VAL_97:.*]] = zext i32 %[[VAL_96]] to i64
+// CHECK1-NEXT: %[[VAL_98:.*]] = load i32, ptr %[[VAL_61]], align 4
+// CHECK1-NEXT: store i32 %[[VAL_98]], ptr %[[VAL_64]], align 4
+// CHECK1-NEXT: %[[VAL_99:.*]] = load i64, ptr %[[VAL_64]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @4, i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp_outlined, i64 %[[VAL_95]], i64 %[[VAL_97]], i64 %[[VAL_99]], ptr %[[VAL_53]])
+// CHECK1-NEXT: br label %[[VAL_88]]
+// CHECK1: omp.inner.for.inc: ; preds = %[[VAL_92]]
+// CHECK1-NEXT: %[[VAL_100:.*]] = load i32, ptr %[[VAL_54]], align 4
+// CHECK1-NEXT: %[[VAL_101:.*]] = load i32, ptr %[[VAL_59]], align 4
+// CHECK1-NEXT: %[[VAL_102:.*]] = add nsw i32 %[[VAL_100]], %[[VAL_101]]
+// CHECK1-NEXT: store i32 %[[VAL_102]], ptr %[[VAL_54]], align 4
+// CHECK1-NEXT: br label %[[VAL_87]]
+// CHECK1: omp.inner.for.end: ; preds = %[[VAL_87]]
+// CHECK1-NEXT: br label %[[VAL_103:.*]]
+// CHECK1: omp.loop.exit: ; preds = %[[VAL_93]]
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @2, i32 %[[VAL_77]])
+// CHECK1-NEXT: %[[VAL_104:.*]] = load i32, ptr %[[VAL_60]], align 4
+// CHECK1-NEXT: %[[VAL_105:.*]] = icmp ne i32 %[[VAL_104]], 0
+// CHECK1-NEXT: br i1 %[[VAL_105]], label %[[VAL_106:.*]], label %[[VAL_107:.*]]
+// CHECK1: .omp.lastprivate.then: ; preds = %[[VAL_103]]
+// CHECK1-NEXT: store i32 10, ptr %[[VAL_61]], align 4
+// CHECK1-NEXT: %[[VAL_108:.*]] = load i32, ptr %[[VAL_61]], align 4
+// CHECK1-NEXT: store i32 %[[VAL_108]], ptr %[[VAL_51]], align 4
+// CHECK1-NEXT: br label %[[VAL_107]]
+// CHECK1: .omp.lastprivate.done: ; preds = %[[VAL_106]], %[[VAL_103]]
+// CHECK1-NEXT: %[[VAL_109:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_65]], i64 0, i64 0
+// CHECK1-NEXT: store ptr %[[VAL_53]], ptr %[[VAL_109]], align 8
+// CHECK1-NEXT: %[[VAL_110:.*]] = call i32 @__kmpc_reduce(ptr @3, i32 %[[VAL_77]], i32 1, i64 8, ptr %[[VAL_65]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: switch i32 %[[VAL_110]], label %[[VAL_111:.*]] [
+// CHECK1-NEXT: i32 1, label %[[VAL_112:.*]]
+// CHECK1-NEXT: i32 2, label %[[VAL_113:.*]]
+// CHECK1-NEXT: ]
+// CHECK1: .omp.reduction.case1: ; preds = %[[VAL_107]]
+// CHECK1-NEXT: %[[VAL_114:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_115:.*]] = load float, ptr %[[VAL_114]], align 4
+// CHECK1-NEXT: %[[VAL_116:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 1
+// CHECK1-NEXT: %[[VAL_117:.*]] = load float, ptr %[[VAL_116]], align 4
+// CHECK1-NEXT: %[[VAL_118:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_119:.*]] = load float, ptr %[[VAL_118]], align 4
+// CHECK1-NEXT: %[[VAL_120:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
+// CHECK1-NEXT: %[[VAL_121:.*]] = load float, ptr %[[VAL_120]], align 4
+// CHECK1-NEXT: %[[VAL_122:.*]] = fadd float %[[VAL_115]], %[[VAL_119]]
+// CHECK1-NEXT: %[[VAL_123:.*]] = fadd float %[[VAL_117]], %[[VAL_121]]
+// CHECK1-NEXT: %[[VAL_124:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_125:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 1
+// CHECK1-NEXT: store float %[[VAL_122]], ptr %[[VAL_124]], align 4
+// CHECK1-NEXT: store float %[[VAL_123]], ptr %[[VAL_125]], align 4
+// CHECK1-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_77]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: br label %[[VAL_111]]
+// CHECK1: .omp.reduction.case2: ; preds = %[[VAL_107]]
+// CHECK1-NEXT: %[[VAL_126:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_127:.*]] = load float, ptr %[[VAL_126]], align 4
+// CHECK1-NEXT: %[[VAL_128:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
+// CHECK1-NEXT: %[[VAL_129:.*]] = load float, ptr %[[VAL_128]], align 4
+// CHECK1-NEXT: call void @__atomic_load(i64 noundef 8, ptr noundef %[[VAL_73]], ptr noundef %[[VAL_66]], i32 noundef signext 0)
+// CHECK1-NEXT: br label %[[VAL_130:.*]]
+// CHECK1: atomic_cont: ; preds = %[[VAL_130]], %[[VAL_113]]
+// CHECK1-NEXT: %[[VAL_131:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_66]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_132:.*]] = load float, ptr %[[VAL_131]], align 4
+// CHECK1-NEXT: %[[VAL_133:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_66]], i32 0, i32 1
+// CHECK1-NEXT: %[[VAL_134:.*]] = load float, ptr %[[VAL_133]], align 4
+// CHECK1-NEXT: %[[VAL_135:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_136:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 1
+// CHECK1-NEXT: store float %[[VAL_132]], ptr %[[VAL_135]], align 4
+// CHECK1-NEXT: store float %[[VAL_134]], ptr %[[VAL_136]], align 4
+// CHECK1-NEXT: %[[VAL_137:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_138:.*]] = load float, ptr %[[VAL_137]], align 4
+// CHECK1-NEXT: %[[VAL_139:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 1
+// CHECK1-NEXT: %[[VAL_140:.*]] = load float, ptr %[[VAL_139]], align 4
+// CHECK1-NEXT: %[[VAL_141:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_142:.*]] = load float, ptr %[[VAL_141]], align 4
+// CHECK1-NEXT: %[[VAL_143:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
+// CHECK1-NEXT: %[[VAL_144:.*]] = load float, ptr %[[VAL_143]], align 4
+// CHECK1-NEXT: %[[VAL_145:.*]] = fadd float %[[VAL_138]], %[[VAL_142]]
+// CHECK1-NEXT: %[[VAL_146:.*]] = fadd float %[[VAL_140]], %[[VAL_144]]
+// CHECK1-NEXT: %[[VAL_147:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_67]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_148:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_67]], i32 0, i32 1
+// CHECK1-NEXT: store float %[[VAL_145]], ptr %[[VAL_147]], align 4
+// CHECK1-NEXT: store float %[[VAL_146]], ptr %[[VAL_148]], align 4
+// CHECK1-NEXT: %[[VAL_149:.*]] = call noundef zeroext i1 @__atomic_compare_exchange(i64 noundef 8, ptr noundef %[[VAL_73]], ptr noundef %[[VAL_66]], ptr noundef %[[VAL_67]], i32 noundef signext 0, i32 noundef signext 0)
+// CHECK1-NEXT: br i1 %[[VAL_149]], label %[[VAL_150:.*]], label %[[VAL_130]]
+// CHECK1: atomic_exit: ; preds = %[[VAL_130]]
+// CHECK1-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_77]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: br label %[[VAL_111]]
+// CHECK1: .omp.reduction.default: ; preds = %[[VAL_150]], %[[VAL_112]], %[[VAL_107]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp.reduction.reduction_func
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: %[[VAL_297:.*]] = alloca ptr, align 8
+// CHECK1-NEXT: %[[VAL_298:.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr %[[VAL_299:.*]], ptr %[[VAL_297]], align 8
+// CHECK1-NEXT: store ptr %[[VAL_300:.*]], ptr %[[VAL_298]], align 8
+// CHECK1-NEXT: %[[VAL_301:.*]] = load ptr, ptr %[[VAL_297]], align 8
+// CHECK1-NEXT: %[[VAL_302:.*]] = load ptr, ptr %[[VAL_298]], align 8
+// CHECK1-NEXT: %[[VAL_303:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_302]], i64 0, i64 0
+// CHECK1-NEXT: %[[VAL_304:.*]] = load ptr, ptr %[[VAL_303]], align 8
+// CHECK1-NEXT: %[[VAL_305:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_301]], i64 0, i64 0
+// CHECK1-NEXT: %[[VAL_306:.*]] = load ptr, ptr %[[VAL_305]], align 8
+// CHECK1-NEXT: %[[VAL_307:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_306]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_308:.*]] = load float, ptr %[[VAL_307]], align 4
+// CHECK1-NEXT: %[[VAL_309:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_306]], i32 0, i32 1
+// CHECK1-NEXT: %[[VAL_310:.*]] = load float, ptr %[[VAL_309]], align 4
+// CHECK1-NEXT: %[[VAL_311:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_304]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_312:.*]] = load float, ptr %[[VAL_311]], align 4
+// CHECK1-NEXT: %[[VAL_313:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_304]], i32 0, i32 1
+// CHECK1-NEXT: %[[VAL_314:.*]] = load float, ptr %[[VAL_313]], align 4
+// CHECK1-NEXT: %[[VAL_315:.*]] = fadd float %[[VAL_308]], %[[VAL_312]]
+// CHECK1-NEXT: %[[VAL_316:.*]] = fadd float %[[VAL_310]], %[[VAL_314]]
+// CHECK1-NEXT: %[[VAL_317:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_306]], i32 0, i32 0
+// CHECK1-NEXT: %[[VAL_318:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_306]], i32 0, i32 1
+// CHECK1-NEXT: store float %[[VAL_315]], ptr %[[VAL_317]], align 4
+// CHECK1-NEXT: store float %[[VAL_316]], ptr %[[VAL_318]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_Z3foov
+// CHECK2-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: %[[VAL_0:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_1:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_2:.*]] = alloca { float, float }, align 4
+// CHECK2-NEXT: %[[VAL_3:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_4:.*]] = alloca [2 x ptr], align 4
+// CHECK2-NEXT: %[[VAL_5:.*]] = alloca [2 x ptr], align 4
+// CHECK2-NEXT: %[[VAL_6:.*]] = alloca [2 x ptr], align 4
+// CHECK2-NEXT: %[[VAL_7:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_8:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_9:.*]] = alloca %[[VAL_10:.*]], align 8
+// CHECK2-NEXT: %[[VAL_11:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_2]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_12:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_2]], i32 0, i32 1
+// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_11]], align 4
+// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_12]], align 4
+// CHECK2-NEXT: %[[VAL_13:.*]] = load i32, ptr %[[VAL_1]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_13]], ptr %[[VAL_3]], align 4
+// CHECK2-NEXT: %[[VAL_14:.*]] = load i32, ptr %[[VAL_3]], align 4
+// CHECK2-NEXT: %[[VAL_15:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 0
+// CHECK2-NEXT: store i32 %[[VAL_14]], ptr %[[VAL_15]], align 4
+// CHECK2-NEXT: %[[VAL_16:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 0
+// CHECK2-NEXT: store i32 %[[VAL_14]], ptr %[[VAL_16]], align 4
+// CHECK2-NEXT: %[[VAL_17:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_6]], i32 0, i32 0
+// CHECK2-NEXT: store ptr null, ptr %[[VAL_17]], align 4
+// CHECK2-NEXT: %[[VAL_18:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 1
+// CHECK2-NEXT: store ptr %[[VAL_2]], ptr %[[VAL_18]], align 4
+// CHECK2-NEXT: %[[VAL_19:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 1
+// CHECK2-NEXT: store ptr %[[VAL_2]], ptr %[[VAL_19]], align 4
+// CHECK2-NEXT: %[[VAL_20:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_6]], i32 0, i32 1
+// CHECK2-NEXT: store ptr null, ptr %[[VAL_20]], align 4
+// CHECK2-NEXT: %[[VAL_21:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_22:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_23:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 0
+// CHECK2-NEXT: store i32 2, ptr %[[VAL_23]], align 4
+// CHECK2-NEXT: %[[VAL_24:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 1
+// CHECK2-NEXT: store i32 2, ptr %[[VAL_24]], align 4
+// CHECK2-NEXT: %[[VAL_25:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 2
+// CHECK2-NEXT: store ptr %[[VAL_21]], ptr %[[VAL_25]], align 4
+// CHECK2-NEXT: %[[VAL_26:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 3
+// CHECK2-NEXT: store ptr %[[VAL_22]], ptr %[[VAL_26]], align 4
+// CHECK2-NEXT: %[[VAL_27:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 4
+// CHECK2-NEXT: store ptr @.offload_sizes, ptr %[[VAL_27]], align 4
+// CHECK2-NEXT: %[[VAL_28:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 5
+// CHECK2-NEXT: store ptr @.offload_maptypes, ptr %[[VAL_28]], align 4
+// CHECK2-NEXT: %[[VAL_29:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 6
+// CHECK2-NEXT: store ptr null, ptr %[[VAL_29]], align 4
+// CHECK2-NEXT: %[[VAL_30:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 7
+// CHECK2-NEXT: store ptr null, ptr %[[VAL_30]], align 4
+// CHECK2-NEXT: %[[VAL_31:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 8
+// CHECK2-NEXT: store i64 100, ptr %[[VAL_31]], align 8
+// CHECK2-NEXT: %[[VAL_32:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 9
+// CHECK2-NEXT: store i64 0, ptr %[[VAL_32]], align 8
+// CHECK2-NEXT: %[[VAL_33:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 10
+// CHECK2-NEXT: store [3 x i32] zeroinitializer, ptr %[[VAL_33]], align 4
+// CHECK2-NEXT: %[[VAL_34:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 11
+// CHECK2-NEXT: store [3 x i32] zeroinitializer, ptr %[[VAL_34]], align 4
+// CHECK2-NEXT: %[[VAL_35:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 12
+// CHECK2-NEXT: store i32 0, ptr %[[VAL_35]], align 4
+// CHECK2-NEXT: %[[VAL_36:.*]] = call i32 @__tgt_target_kernel(ptr @4, i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.region_id, ptr %[[VAL_9]])
+// CHECK2-NEXT: %[[VAL_37:.*]] = icmp ne i32 %[[VAL_36]], 0
+// CHECK2-NEXT: br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_39:.*]]
+// CHECK2: omp_offload.failed: ; preds = %[[VAL_40:.*]]
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31(i32 %[[VAL_14]], ptr %[[VAL_2]]) #2
+// CHECK2-NEXT: br label %[[VAL_39]]
+// CHECK2: omp_offload.cont: ; preds = %[[VAL_38]], %[[VAL_40]]
+// CHECK2-NEXT: ret i32 0
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31
+// CHECK2-SAME: (i32 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: %[[VAL_41:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_42:.*]] = alloca ptr, align 4
+// CHECK2-NEXT: %[[VAL_43:.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 %[[VAL_44:.*]], ptr %[[VAL_41]], align 4
+// CHECK2-NEXT: store ptr %[[VAL_45:.*]], ptr %[[VAL_42]], align 4
+// CHECK2-NEXT: %[[VAL_46:.*]] = load ptr, ptr %[[VAL_42]], align 4
+// CHECK2-NEXT: %[[VAL_47:.*]] = load i32, ptr %[[VAL_41]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_47]], ptr %[[VAL_43]], align 4
+// CHECK2-NEXT: %[[VAL_48:.*]] = load i32, ptr %[[VAL_43]], align 4
+// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @4, i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined, i32 %[[VAL_48]], ptr %[[VAL_46]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: %[[VAL_49:.*]] = alloca ptr, align 4
+// CHECK2-NEXT: %[[VAL_50:.*]] = alloca ptr, align 4
+// CHECK2-NEXT: %[[VAL_51:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_52:.*]] = alloca ptr, align 4
+// CHECK2-NEXT: %[[VAL_53:.*]] = alloca { float, float }, align 4
+// CHECK2-NEXT: %[[VAL_54:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_55:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_56:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_57:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_58:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_59:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_60:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_61:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_62:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_63:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_64:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_65:.*]] = alloca [1 x ptr], align 4
+// CHECK2-NEXT: %[[VAL_66:.*]] = alloca { float, float }, align 4
+// CHECK2-NEXT: %[[VAL_67:.*]] = alloca { float, float }, align 4
+// CHECK2-NEXT: %[[VAL_68:.*]] = alloca { float, float }, align 4
+// CHECK2-NEXT: store ptr %[[VAL_69:.*]], ptr %[[VAL_49]], align 4
+// CHECK2-NEXT: store ptr %[[VAL_70:.*]], ptr %[[VAL_50]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_71:.*]], ptr %[[VAL_51]], align 4
+// CHECK2-NEXT: store ptr %[[VAL_72:.*]], ptr %[[VAL_52]], align 4
+// CHECK2-NEXT: %[[VAL_73:.*]] = load ptr, ptr %[[VAL_52]], align 4
+// CHECK2-NEXT: %[[VAL_74:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_75:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
+// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_74]], align 4
+// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_75]], align 4
+// CHECK2-NEXT: store i32 0, ptr %[[VAL_57]], align 4
+// CHECK2-NEXT: store i32 99, ptr %[[VAL_58]], align 4
+// CHECK2-NEXT: store i32 1, ptr %[[VAL_59]], align 4
+// CHECK2-NEXT: store i32 0, ptr %[[VAL_60]], align 4
+// CHECK2-NEXT: %[[VAL_76:.*]] = load ptr, ptr %[[VAL_49]], align 4
+// CHECK2-NEXT: %[[VAL_77:.*]] = load i32, ptr %[[VAL_76]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @1, i32 %[[VAL_77]], i32 92, ptr %[[VAL_60]], ptr %[[VAL_57]], ptr %[[VAL_58]], ptr %[[VAL_59]], i32 1, i32 1)
+// CHECK2-NEXT: %[[VAL_78:.*]] = load i32, ptr %[[VAL_58]], align 4
+// CHECK2-NEXT: %[[VAL_79:.*]] = icmp sgt i32 %[[VAL_78]], 99
+// CHECK2-NEXT: br i1 %[[VAL_79]], label %[[VAL_80:.*]], label %[[VAL_81:.*]]
+// CHECK2: cond.true: ; preds = %[[VAL_82:.*]]
+// CHECK2-NEXT: br label %[[VAL_83:.*]]
+// CHECK2: cond.false: ; preds = %[[VAL_82]]
+// CHECK2-NEXT: %[[VAL_84:.*]] = load i32, ptr %[[VAL_58]], align 4
+// CHECK2-NEXT: br label %[[VAL_83]]
+// CHECK2: cond.end: ; preds = %[[VAL_81]], %[[VAL_80]]
+// CHECK2-NEXT: %[[VAL_85:.*]] = phi i32 [ 99, %[[VAL_80]] ], [ %[[VAL_84]], %[[VAL_81]] ]
+// CHECK2-NEXT: store i32 %[[VAL_85]], ptr %[[VAL_58]], align 4
+// CHECK2-NEXT: %[[VAL_86:.*]] = load i32, ptr %[[VAL_57]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_86]], ptr %[[VAL_54]], align 4
+// CHECK2-NEXT: br label %[[VAL_87:.*]]
+// CHECK2: omp.inner.for.cond: ; preds = %[[VAL_88:.*]], %[[VAL_83]]
+// CHECK2-NEXT: %[[VAL_89:.*]] = load i32, ptr %[[VAL_54]], align 4
+// CHECK2-NEXT: %[[VAL_90:.*]] = load i32, ptr %[[VAL_58]], align 4
+// CHECK2-NEXT: %[[VAL_91:.*]] = icmp sle i32 %[[VAL_89]], %[[VAL_90]]
+// CHECK2-NEXT: br i1 %[[VAL_91]], label %[[VAL_92:.*]], label %[[VAL_93:.*]]
+// CHECK2: omp.inner.for.body: ; preds = %[[VAL_87]]
+// CHECK2-NEXT: %[[VAL_94:.*]] = load i32, ptr %[[VAL_57]], align 4
+// CHECK2-NEXT: %[[VAL_95:.*]] = load i32, ptr %[[VAL_58]], align 4
+// CHECK2-NEXT: %[[VAL_96:.*]] = load i32, ptr %[[VAL_61]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_96]], ptr %[[VAL_64]], align 4
+// CHECK2-NEXT: %[[VAL_97:.*]] = load i32, ptr %[[VAL_64]], align 4
+// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @4, i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp_outlined, i32 %[[VAL_94]], i32 %[[VAL_95]], i32 %[[VAL_97]], ptr %[[VAL_53]])
+// CHECK2-NEXT: br label %[[VAL_88]]
+// CHECK2: omp.inner.for.inc: ; preds = %[[VAL_92]]
+// CHECK2-NEXT: %[[VAL_98:.*]] = load i32, ptr %[[VAL_54]], align 4
+// CHECK2-NEXT: %[[VAL_99:.*]] = load i32, ptr %[[VAL_59]], align 4
+// CHECK2-NEXT: %[[VAL_100:.*]] = add nsw i32 %[[VAL_98]], %[[VAL_99]]
+// CHECK2-NEXT: store i32 %[[VAL_100]], ptr %[[VAL_54]], align 4
+// CHECK2-NEXT: br label %[[VAL_87]]
+// CHECK2: omp.inner.for.end: ; preds = %[[VAL_87]]
+// CHECK2-NEXT: br label %[[VAL_101:.*]]
+// CHECK2: omp.loop.exit: ; preds = %[[VAL_93]]
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @2, i32 %[[VAL_77]])
+// CHECK2-NEXT: %[[VAL_102:.*]] = load i32, ptr %[[VAL_60]], align 4
+// CHECK2-NEXT: %[[VAL_103:.*]] = icmp ne i32 %[[VAL_102]], 0
+// CHECK2-NEXT: br i1 %[[VAL_103]], label %[[VAL_104:.*]], label %[[VAL_105:.*]]
+// CHECK2: .omp.lastprivate.then: ; preds = %[[VAL_101]]
+// CHECK2-NEXT: store i32 10, ptr %[[VAL_61]], align 4
+// CHECK2-NEXT: %[[VAL_106:.*]] = load i32, ptr %[[VAL_61]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_106]], ptr %[[VAL_51]], align 4
+// CHECK2-NEXT: br label %[[VAL_105]]
+// CHECK2: .omp.lastprivate.done: ; preds = %[[VAL_104]], %[[VAL_101]]
+// CHECK2-NEXT: %[[VAL_107:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_65]], i32 0, i32 0
+// CHECK2-NEXT: store ptr %[[VAL_53]], ptr %[[VAL_107]], align 4
+// CHECK2-NEXT: %[[VAL_108:.*]] = call i32 @__kmpc_reduce(ptr @3, i32 %[[VAL_77]], i32 1, i32 4, ptr %[[VAL_65]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK2-NEXT: switch i32 %[[VAL_108]], label %[[VAL_109:.*]] [
+// CHECK2-NEXT: i32 1, label %[[VAL_110:.*]]
+// CHECK2-NEXT: i32 2, label %[[VAL_111:.*]]
+// CHECK2-NEXT: ]
+// CHECK2: .omp.reduction.case1: ; preds = %[[VAL_105]]
+// CHECK2-NEXT: %[[VAL_112:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_113:.*]] = load float, ptr %[[VAL_112]], align 4
+// CHECK2-NEXT: %[[VAL_114:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_115:.*]] = load float, ptr %[[VAL_114]], align 4
+// CHECK2-NEXT: %[[VAL_116:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_117:.*]] = load float, ptr %[[VAL_116]], align 4
+// CHECK2-NEXT: %[[VAL_118:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_119:.*]] = load float, ptr %[[VAL_118]], align 4
+// CHECK2-NEXT: %[[VAL_120:.*]] = fadd float %[[VAL_113]], %[[VAL_117]]
+// CHECK2-NEXT: %[[VAL_121:.*]] = fadd float %[[VAL_115]], %[[VAL_119]]
+// CHECK2-NEXT: %[[VAL_122:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_123:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_120]], ptr %[[VAL_122]], align 4
+// CHECK2-NEXT: store float %[[VAL_121]], ptr %[[VAL_123]], align 4
+// CHECK2-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_77]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK2-NEXT: br label %[[VAL_109]]
+// CHECK2: .omp.reduction.case2: ; preds = %[[VAL_105]]
+// CHECK2-NEXT: %[[VAL_124:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_125:.*]] = load float, ptr %[[VAL_124]], align 4
+// CHECK2-NEXT: %[[VAL_126:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_127:.*]] = load float, ptr %[[VAL_126]], align 4
+// CHECK2-NEXT: call void @__atomic_load(i32 noundef 8, ptr noundef %[[VAL_73]], ptr noundef %[[VAL_66]], i32 noundef 0)
+// CHECK2-NEXT: br label %[[VAL_128:.*]]
+// CHECK2: atomic_cont: ; preds = %[[VAL_128]], %[[VAL_111]]
+// CHECK2-NEXT: %[[VAL_129:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_66]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_130:.*]] = load float, ptr %[[VAL_129]], align 4
+// CHECK2-NEXT: %[[VAL_131:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_66]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_132:.*]] = load float, ptr %[[VAL_131]], align 4
+// CHECK2-NEXT: %[[VAL_133:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_134:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_130]], ptr %[[VAL_133]], align 4
+// CHECK2-NEXT: store float %[[VAL_132]], ptr %[[VAL_134]], align 4
+// CHECK2-NEXT: %[[VAL_135:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_136:.*]] = load float, ptr %[[VAL_135]], align 4
+// CHECK2-NEXT: %[[VAL_137:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_138:.*]] = load float, ptr %[[VAL_137]], align 4
+// CHECK2-NEXT: %[[VAL_139:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_140:.*]] = load float, ptr %[[VAL_139]], align 4
+// CHECK2-NEXT: %[[VAL_141:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_142:.*]] = load float, ptr %[[VAL_141]], align 4
+// CHECK2-NEXT: %[[VAL_143:.*]] = fadd float %[[VAL_136]], %[[VAL_140]]
+// CHECK2-NEXT: %[[VAL_144:.*]] = fadd float %[[VAL_138]], %[[VAL_142]]
+// CHECK2-NEXT: %[[VAL_145:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_67]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_146:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_67]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_143]], ptr %[[VAL_145]], align 4
+// CHECK2-NEXT: store float %[[VAL_144]], ptr %[[VAL_146]], align 4
+// CHECK2-NEXT: %[[VAL_147:.*]] = call noundef zeroext i1 @__atomic_compare_exchange(i32 noundef 8, ptr noundef %[[VAL_73]], ptr noundef %[[VAL_66]], ptr noundef %[[VAL_67]], i32 noundef 0, i32 noundef 0)
+// CHECK2-NEXT: br i1 %[[VAL_147]], label %[[VAL_148:.*]], label %[[VAL_128]]
+// CHECK2: atomic_exit: ; preds = %[[VAL_128]]
+// CHECK2-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_77]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK2-NEXT: br label %[[VAL_109]]
+// CHECK2: .omp.reduction.default: ; preds = %[[VAL_148]], %[[VAL_110]], %[[VAL_105]]
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: %[[VAL_149:.*]] = alloca ptr, align 4
+// CHECK2-NEXT: %[[VAL_150:.*]] = alloca ptr, align 4
+// CHECK2-NEXT: %[[VAL_151:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_152:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_153:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_154:.*]] = alloca ptr, align 4
+// CHECK2-NEXT: %[[VAL_155:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_156:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_157:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_158:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_159:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_160:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_161:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_162:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_163:.*]] = alloca { float, float }, align 4
+// CHECK2-NEXT: %[[VAL_164:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_165:.*]] = alloca i32, align 4
+// CHECK2-NEXT: %[[VAL_166:.*]] = alloca [1 x ptr], align 4
+// CHECK2-NEXT: %[[VAL_167:.*]] = alloca { float, float }, align 4
+// CHECK2-NEXT: %[[VAL_168:.*]] = alloca { float, float }, align 4
+// CHECK2-NEXT: %[[VAL_169:.*]] = alloca { float, float }, align 4
+// CHECK2-NEXT: store ptr %[[VAL_170:.*]], ptr %[[VAL_149]], align 4
+// CHECK2-NEXT: store ptr %[[VAL_171:.*]], ptr %[[VAL_150]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_172:.*]], ptr %[[VAL_151]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_173:.*]], ptr %[[VAL_152]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_174:.*]], ptr %[[VAL_153]], align 4
+// CHECK2-NEXT: store ptr %[[VAL_175:.*]], ptr %[[VAL_154]], align 4
+// CHECK2-NEXT: %[[VAL_176:.*]] = load ptr, ptr %[[VAL_154]], align 4
+// CHECK2-NEXT: store i32 0, ptr %[[VAL_158]], align 4
+// CHECK2-NEXT: store i32 99, ptr %[[VAL_159]], align 4
+// CHECK2-NEXT: %[[VAL_177:.*]] = load i32, ptr %[[VAL_151]], align 4
+// CHECK2-NEXT: %[[VAL_178:.*]] = load i32, ptr %[[VAL_152]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_177]], ptr %[[VAL_158]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_178]], ptr %[[VAL_159]], align 4
+// CHECK2-NEXT: store i32 1, ptr %[[VAL_160]], align 4
+// CHECK2-NEXT: store i32 0, ptr %[[VAL_161]], align 4
+// CHECK2-NEXT: %[[VAL_179:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_180:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
+// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_179]], align 4
+// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_180]], align 4
+// CHECK2-NEXT: %[[VAL_181:.*]] = load ptr, ptr %[[VAL_149]], align 4
+// CHECK2-NEXT: %[[VAL_182:.*]] = load i32, ptr %[[VAL_181]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @2, i32 %[[VAL_182]], i32 34, ptr %[[VAL_161]], ptr %[[VAL_158]], ptr %[[VAL_159]], ptr %[[VAL_160]], i32 1, i32 1)
+// CHECK2-NEXT: %[[VAL_183:.*]] = load i32, ptr %[[VAL_159]], align 4
+// CHECK2-NEXT: %[[VAL_184:.*]] = icmp sgt i32 %[[VAL_183]], 99
+// CHECK2-NEXT: br i1 %[[VAL_184]], label %[[VAL_185:.*]], label %[[VAL_186:.*]]
+// CHECK2: cond.true: ; preds = %[[VAL_187:.*]]
+// CHECK2-NEXT: br label %[[VAL_188:.*]]
+// CHECK2: cond.false: ; preds = %[[VAL_187]]
+// CHECK2-NEXT: %[[VAL_189:.*]] = load i32, ptr %[[VAL_159]], align 4
+// CHECK2-NEXT: br label %[[VAL_188]]
+// CHECK2: cond.end: ; preds = %[[VAL_186]], %[[VAL_185]]
+// CHECK2-NEXT: %[[VAL_190:.*]] = phi i32 [ 99, %[[VAL_185]] ], [ %[[VAL_189]], %[[VAL_186]] ]
+// CHECK2-NEXT: store i32 %[[VAL_190]], ptr %[[VAL_159]], align 4
+// CHECK2-NEXT: %[[VAL_191:.*]] = load i32, ptr %[[VAL_158]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_191]], ptr %[[VAL_155]], align 4
+// CHECK2-NEXT: br label %[[VAL_192:.*]]
+// CHECK2: omp.inner.for.cond: ; preds = %[[VAL_193:.*]], %[[VAL_188]]
+// CHECK2-NEXT: %[[VAL_194:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_195:.*]] = load i32, ptr %[[VAL_159]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_196:.*]] = icmp sle i32 %[[VAL_194]], %[[VAL_195]]
+// CHECK2-NEXT: br i1 %[[VAL_196]], label %[[VAL_197:.*]], label %[[VAL_198:.*]]
+// CHECK2: omp.inner.for.body: ; preds = %[[VAL_192]]
+// CHECK2-NEXT: %[[VAL_199:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_200:.*]] = sdiv i32 %[[VAL_199]], 10
+// CHECK2-NEXT: %[[VAL_201:.*]] = mul nsw i32 %[[VAL_200]], 1
+// CHECK2-NEXT: %[[VAL_202:.*]] = add nsw i32 0, %[[VAL_201]]
+// CHECK2-NEXT: store i32 %[[VAL_202]], ptr %[[VAL_164]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_203:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_204:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_205:.*]] = sdiv i32 %[[VAL_204]], 10
+// CHECK2-NEXT: %[[VAL_206:.*]] = mul nsw i32 %[[VAL_205]], 10
+// CHECK2-NEXT: %[[VAL_207:.*]] = sub nsw i32 %[[VAL_203]], %[[VAL_206]]
+// CHECK2-NEXT: %[[VAL_208:.*]] = mul nsw i32 %[[VAL_207]], 1
+// CHECK2-NEXT: %[[VAL_209:.*]] = add nsw i32 0, %[[VAL_208]]
+// CHECK2-NEXT: store i32 %[[VAL_209]], ptr %[[VAL_162]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_210:.*]] = load i32, ptr %[[VAL_164]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_211:.*]] = sitofp i32 %[[VAL_210]] to float
+// CHECK2-NEXT: %[[VAL_212:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_213:.*]] = load float, ptr %[[VAL_212]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_214:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_215:.*]] = load float, ptr %[[VAL_214]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_216:.*]] = fadd float %[[VAL_213]], %[[VAL_211]]
+// CHECK2-NEXT: %[[VAL_217:.*]] = fadd float %[[VAL_215]], 0.000000e+00
+// CHECK2-NEXT: %[[VAL_218:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_219:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_216]], ptr %[[VAL_218]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: store float %[[VAL_217]], ptr %[[VAL_219]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: br label %[[VAL_220:.*]]
+// CHECK2: omp.body.continue: ; preds = %[[VAL_197]]
+// CHECK2-NEXT: br label %[[VAL_193]]
+// CHECK2: omp.inner.for.inc: ; preds = %[[VAL_220]]
+// CHECK2-NEXT: %[[VAL_221:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_222:.*]] = add nsw i32 %[[VAL_221]], 1
+// CHECK2-NEXT: store i32 %[[VAL_222]], ptr %[[VAL_155]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: br label %[[VAL_192]], !llvm.loop !6
+// CHECK2: omp.inner.for.end: ; preds = %[[VAL_192]]
+// CHECK2-NEXT: br label %[[VAL_223:.*]]
+// CHECK2: omp.loop.exit: ; preds = %[[VAL_198]]
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @2, i32 %[[VAL_182]])
+// CHECK2-NEXT: %[[VAL_224:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_166]], i32 0, i32 0
+// CHECK2-NEXT: store ptr %[[VAL_163]], ptr %[[VAL_224]], align 4
+// CHECK2-NEXT: %[[VAL_225:.*]] = call i32 @__kmpc_reduce(ptr @3, i32 %[[VAL_182]], i32 1, i32 4, ptr %[[VAL_166]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK2-NEXT: switch i32 %[[VAL_225]], label %[[VAL_226:.*]] [
+// CHECK2-NEXT: i32 1, label %[[VAL_227:.*]]
+// CHECK2-NEXT: i32 2, label %[[VAL_228:.*]]
+// CHECK2-NEXT: ]
+// CHECK2: .omp.reduction.case1: ; preds = %[[VAL_223]]
+// CHECK2-NEXT: %[[VAL_229:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_230:.*]] = load float, ptr %[[VAL_229]], align 4
+// CHECK2-NEXT: %[[VAL_231:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_232:.*]] = load float, ptr %[[VAL_231]], align 4
+// CHECK2-NEXT: %[[VAL_233:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_234:.*]] = load float, ptr %[[VAL_233]], align 4
+// CHECK2-NEXT: %[[VAL_235:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_236:.*]] = load float, ptr %[[VAL_235]], align 4
+// CHECK2-NEXT: %[[VAL_237:.*]] = fadd float %[[VAL_230]], %[[VAL_234]]
+// CHECK2-NEXT: %[[VAL_238:.*]] = fadd float %[[VAL_232]], %[[VAL_236]]
+// CHECK2-NEXT: %[[VAL_239:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_240:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_237]], ptr %[[VAL_239]], align 4
+// CHECK2-NEXT: store float %[[VAL_238]], ptr %[[VAL_240]], align 4
+// CHECK2-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_182]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK2-NEXT: br label %[[VAL_226]]
+// CHECK2: .omp.reduction.case2: ; preds = %[[VAL_223]]
+// CHECK2-NEXT: %[[VAL_241:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_242:.*]] = load float, ptr %[[VAL_241]], align 4
+// CHECK2-NEXT: %[[VAL_243:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_244:.*]] = load float, ptr %[[VAL_243]], align 4
+// CHECK2-NEXT: call void @__atomic_load(i32 noundef 8, ptr noundef %[[VAL_176]], ptr noundef %[[VAL_167]], i32 noundef 0)
+// CHECK2-NEXT: br label %[[VAL_245:.*]]
+// CHECK2: atomic_cont: ; preds = %[[VAL_245]], %[[VAL_228]]
+// CHECK2-NEXT: %[[VAL_246:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_167]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_247:.*]] = load float, ptr %[[VAL_246]], align 4
+// CHECK2-NEXT: %[[VAL_248:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_167]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_249:.*]] = load float, ptr %[[VAL_248]], align 4
+// CHECK2-NEXT: %[[VAL_250:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_251:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_247]], ptr %[[VAL_250]], align 4
+// CHECK2-NEXT: store float %[[VAL_249]], ptr %[[VAL_251]], align 4
+// CHECK2-NEXT: %[[VAL_252:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_253:.*]] = load float, ptr %[[VAL_252]], align 4
+// CHECK2-NEXT: %[[VAL_254:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_255:.*]] = load float, ptr %[[VAL_254]], align 4
+// CHECK2-NEXT: %[[VAL_256:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_257:.*]] = load float, ptr %[[VAL_256]], align 4
+// CHECK2-NEXT: %[[VAL_258:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_259:.*]] = load float, ptr %[[VAL_258]], align 4
+// CHECK2-NEXT: %[[VAL_260:.*]] = fadd float %[[VAL_253]], %[[VAL_257]]
+// CHECK2-NEXT: %[[VAL_261:.*]] = fadd float %[[VAL_255]], %[[VAL_259]]
+// CHECK2-NEXT: %[[VAL_262:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_168]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_263:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_168]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_260]], ptr %[[VAL_262]], align 4
+// CHECK2-NEXT: store float %[[VAL_261]], ptr %[[VAL_263]], align 4
+// CHECK2-NEXT: %[[VAL_264:.*]] = call noundef zeroext i1 @__atomic_compare_exchange(i32 noundef 8, ptr noundef %[[VAL_176]], ptr noundef %[[VAL_167]], ptr noundef %[[VAL_168]], i32 noundef 0, i32 noundef 0)
+// CHECK2-NEXT: br i1 %[[VAL_264]], label %[[VAL_265:.*]], label %[[VAL_245]]
+// CHECK2: atomic_exit: ; preds = %[[VAL_245]]
+// CHECK2-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_182]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK2-NEXT: br label %[[VAL_226]]
+// CHECK2: .omp.reduction.default: ; preds = %[[VAL_265]], %[[VAL_227]], %[[VAL_223]]
+// CHECK2-NEXT: %[[VAL_266:.*]] = load i32, ptr %[[VAL_161]], align 4
+// CHECK2-NEXT: %[[VAL_267:.*]] = icmp ne i32 %[[VAL_266]], 0
+// CHECK2-NEXT: br i1 %[[VAL_267]], label %[[VAL_268:.*]], label %[[VAL_269:.*]]
+// CHECK2: .omp.lastprivate.then: ; preds = %[[VAL_226]]
+// CHECK2-NEXT: store i32 10, ptr %[[VAL_162]], align 4
+// CHECK2-NEXT: %[[VAL_270:.*]] = load i32, ptr %[[VAL_162]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_270]], ptr %[[VAL_153]], align 4
+// CHECK2-NEXT: br label %[[VAL_269]]
+// CHECK2: .omp.lastprivate.done: ; preds = %[[VAL_268]], %[[VAL_226]]
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// CHECK2-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: %[[VAL_271:.*]] = alloca ptr, align 4
+// CHECK2-NEXT: %[[VAL_272:.*]] = alloca ptr, align 4
+// CHECK2-NEXT: store ptr %[[VAL_273:.*]], ptr %[[VAL_271]], align 4
+// CHECK2-NEXT: store ptr %[[VAL_274:.*]], ptr %[[VAL_272]], align 4
+// CHECK2-NEXT: %[[VAL_275:.*]] = load ptr, ptr %[[VAL_271]], align 4
+// CHECK2-NEXT: %[[VAL_276:.*]] = load ptr, ptr %[[VAL_272]], align 4
+// CHECK2-NEXT: %[[VAL_277:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_276]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_278:.*]] = load ptr, ptr %[[VAL_277]], align 4
+// CHECK2-NEXT: %[[VAL_279:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_275]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_280:.*]] = load ptr, ptr %[[VAL_279]], align 4
+// CHECK2-NEXT: %[[VAL_281:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_280]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_282:.*]] = load float, ptr %[[VAL_281]], align 4
+// CHECK2-NEXT: %[[VAL_283:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_280]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_284:.*]] = load float, ptr %[[VAL_283]], align 4
+// CHECK2-NEXT: %[[VAL_285:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_278]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_286:.*]] = load float, ptr %[[VAL_285]], align 4
+// CHECK2-NEXT: %[[VAL_287:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_278]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_288:.*]] = load float, ptr %[[VAL_287]], align 4
+// CHECK2-NEXT: %[[VAL_289:.*]] = fadd float %[[VAL_282]], %[[VAL_286]]
+// CHECK2-NEXT: %[[VAL_290:.*]] = fadd float %[[VAL_284]], %[[VAL_288]]
+// CHECK2-NEXT: %[[VAL_291:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_280]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_292:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_280]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_289]], ptr %[[VAL_291]], align 4
+// CHECK2-NEXT: store float %[[VAL_290]], ptr %[[VAL_292]], align 4
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp.reduction.reduction_func
+// CHECK2-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: %[[VAL_293:.*]] = alloca ptr, align 4
+// CHECK2-NEXT: %[[VAL_294:.*]] = alloca ptr, align 4
+// CHECK2-NEXT: store ptr %[[VAL_295:.*]], ptr %[[VAL_293]], align 4
+// CHECK2-NEXT: store ptr %[[VAL_296:.*]], ptr %[[VAL_294]], align 4
+// CHECK2-NEXT: %[[VAL_297:.*]] = load ptr, ptr %[[VAL_293]], align 4
+// CHECK2-NEXT: %[[VAL_298:.*]] = load ptr, ptr %[[VAL_294]], align 4
+// CHECK2-NEXT: %[[VAL_299:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_298]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_300:.*]] = load ptr, ptr %[[VAL_299]], align 4
+// CHECK2-NEXT: %[[VAL_301:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_297]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_302:.*]] = load ptr, ptr %[[VAL_301]], align 4
+// CHECK2-NEXT: %[[VAL_303:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_302]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_304:.*]] = load float, ptr %[[VAL_303]], align 4
+// CHECK2-NEXT: %[[VAL_305:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_302]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_306:.*]] = load float, ptr %[[VAL_305]], align 4
+// CHECK2-NEXT: %[[VAL_307:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_300]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_308:.*]] = load float, ptr %[[VAL_307]], align 4
+// CHECK2-NEXT: %[[VAL_309:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_300]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_310:.*]] = load float, ptr %[[VAL_309]], align 4
+// CHECK2-NEXT: %[[VAL_311:.*]] = fadd float %[[VAL_304]], %[[VAL_308]]
+// CHECK2-NEXT: %[[VAL_312:.*]] = fadd float %[[VAL_306]], %[[VAL_310]]
+// CHECK2-NEXT: %[[VAL_313:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_302]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_314:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_302]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_311]], ptr %[[VAL_313]], align 4
+// CHECK2-NEXT: store float %[[VAL_312]], ptr %[[VAL_314]], align 4
+// CHECK2-NEXT: ret void
\ No newline at end of file
>From 5446f4414e9e8b700992f561e2242b688914092e Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 20 Mar 2024 17:05:01 +0000
Subject: [PATCH 15/18] Rebased
---
clang/test/OpenMP/reduction_complex.c | 151 +++++++++---------
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 7 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 93 +++++------
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 40 ++---
4 files changed, 141 insertions(+), 150 deletions(-)
diff --git a/clang/test/OpenMP/reduction_complex.c b/clang/test/OpenMP/reduction_complex.c
index cbc5b14087239d..cb7c20926d4ea2 100644
--- a/clang/test/OpenMP/reduction_complex.c
+++ b/clang/test/OpenMP/reduction_complex.c
@@ -171,7 +171,7 @@ int foo() {
// CHECK: omp.inner.for.end: ; preds = %[[VAL_55]]
// CHECK-NEXT: br label %[[VAL_91:.*]]
// CHECK: omp.loop.exit: ; preds = %[[VAL_60]]
-// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @3, i32 %[[VAL_45]])
+// CHECK-NEXT: call void @__kmpc_distribute_static_fini(ptr @2, i32 %[[VAL_45]])
// CHECK-NEXT: %[[VAL_92:.*]] = load i32, ptr %[[VAL_29]], align 4
// CHECK-NEXT: %[[VAL_93:.*]] = icmp ne i32 %[[VAL_92]], 0
// CHECK-NEXT: br i1 %[[VAL_93]], label %[[VAL_94:.*]], label %[[VAL_95:.*]]
@@ -500,7 +500,7 @@ int foo() {
// CHECK1: omp.inner.for.end: ; preds = %[[VAL_87]]
// CHECK1-NEXT: br label %[[VAL_103:.*]]
// CHECK1: omp.loop.exit: ; preds = %[[VAL_93]]
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @2, i32 %[[VAL_77]])
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_77]])
// CHECK1-NEXT: %[[VAL_104:.*]] = load i32, ptr %[[VAL_60]], align 4
// CHECK1-NEXT: %[[VAL_105:.*]] = icmp ne i32 %[[VAL_104]], 0
// CHECK1-NEXT: br i1 %[[VAL_105]], label %[[VAL_106:.*]], label %[[VAL_107:.*]]
@@ -764,7 +764,7 @@ int foo() {
// CHECK2: omp.inner.for.end: ; preds = %[[VAL_87]]
// CHECK2-NEXT: br label %[[VAL_101:.*]]
// CHECK2: omp.loop.exit: ; preds = %[[VAL_93]]
-// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @2, i32 %[[VAL_77]])
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_77]])
// CHECK2-NEXT: %[[VAL_102:.*]] = load i32, ptr %[[VAL_60]], align 4
// CHECK2-NEXT: %[[VAL_103:.*]] = icmp ne i32 %[[VAL_102]], 0
// CHECK2-NEXT: br i1 %[[VAL_103]], label %[[VAL_104:.*]], label %[[VAL_105:.*]]
@@ -923,92 +923,91 @@ int foo() {
// CHECK2-NEXT: %[[VAL_214:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
// CHECK2-NEXT: %[[VAL_215:.*]] = load float, ptr %[[VAL_214]], align 4, !llvm.access.group !5
// CHECK2-NEXT: %[[VAL_216:.*]] = fadd float %[[VAL_213]], %[[VAL_211]]
-// CHECK2-NEXT: %[[VAL_217:.*]] = fadd float %[[VAL_215]], 0.000000e+00
-// CHECK2-NEXT: %[[VAL_218:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_219:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
-// CHECK2-NEXT: store float %[[VAL_216]], ptr %[[VAL_218]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: store float %[[VAL_217]], ptr %[[VAL_219]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: br label %[[VAL_220:.*]]
+// CHECK2-NEXT: %[[VAL_217:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_218:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_216]], ptr %[[VAL_217]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: store float %[[VAL_215]], ptr %[[VAL_218]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: br label %[[VAL_219:.*]]
// CHECK2: omp.body.continue: ; preds = %[[VAL_197]]
// CHECK2-NEXT: br label %[[VAL_193]]
-// CHECK2: omp.inner.for.inc: ; preds = %[[VAL_220]]
-// CHECK2-NEXT: %[[VAL_221:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_222:.*]] = add nsw i32 %[[VAL_221]], 1
-// CHECK2-NEXT: store i32 %[[VAL_222]], ptr %[[VAL_155]], align 4, !llvm.access.group !5
+// CHECK2: omp.inner.for.inc: ; preds = %[[VAL_219]]
+// CHECK2-NEXT: %[[VAL_220:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
+// CHECK2-NEXT: %[[VAL_221:.*]] = add nsw i32 %[[VAL_220]], 1
+// CHECK2-NEXT: store i32 %[[VAL_221]], ptr %[[VAL_155]], align 4, !llvm.access.group !5
// CHECK2-NEXT: br label %[[VAL_192]], !llvm.loop !6
// CHECK2: omp.inner.for.end: ; preds = %[[VAL_192]]
-// CHECK2-NEXT: br label %[[VAL_223:.*]]
+// CHECK2-NEXT: br label %[[VAL_222:.*]]
// CHECK2: omp.loop.exit: ; preds = %[[VAL_198]]
// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @2, i32 %[[VAL_182]])
-// CHECK2-NEXT: %[[VAL_224:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_166]], i32 0, i32 0
-// CHECK2-NEXT: store ptr %[[VAL_163]], ptr %[[VAL_224]], align 4
-// CHECK2-NEXT: %[[VAL_225:.*]] = call i32 @__kmpc_reduce(ptr @3, i32 %[[VAL_182]], i32 1, i32 4, ptr %[[VAL_166]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
-// CHECK2-NEXT: switch i32 %[[VAL_225]], label %[[VAL_226:.*]] [
-// CHECK2-NEXT: i32 1, label %[[VAL_227:.*]]
-// CHECK2-NEXT: i32 2, label %[[VAL_228:.*]]
+// CHECK2-NEXT: %[[VAL_223:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_166]], i32 0, i32 0
+// CHECK2-NEXT: store ptr %[[VAL_163]], ptr %[[VAL_223]], align 4
+// CHECK2-NEXT: %[[VAL_224:.*]] = call i32 @__kmpc_reduce(ptr @3, i32 %[[VAL_182]], i32 1, i32 4, ptr %[[VAL_166]], ptr @__omp_offloading_1030b_4868a89__Z3foov_l31.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK2-NEXT: switch i32 %[[VAL_224]], label %[[VAL_225:.*]] [
+// CHECK2-NEXT: i32 1, label %[[VAL_226:.*]]
+// CHECK2-NEXT: i32 2, label %[[VAL_227:.*]]
// CHECK2-NEXT: ]
-// CHECK2: .omp.reduction.case1: ; preds = %[[VAL_223]]
-// CHECK2-NEXT: %[[VAL_229:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_230:.*]] = load float, ptr %[[VAL_229]], align 4
-// CHECK2-NEXT: %[[VAL_231:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_232:.*]] = load float, ptr %[[VAL_231]], align 4
-// CHECK2-NEXT: %[[VAL_233:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_234:.*]] = load float, ptr %[[VAL_233]], align 4
-// CHECK2-NEXT: %[[VAL_235:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_236:.*]] = load float, ptr %[[VAL_235]], align 4
-// CHECK2-NEXT: %[[VAL_237:.*]] = fadd float %[[VAL_230]], %[[VAL_234]]
-// CHECK2-NEXT: %[[VAL_238:.*]] = fadd float %[[VAL_232]], %[[VAL_236]]
-// CHECK2-NEXT: %[[VAL_239:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_240:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 1
+// CHECK2: .omp.reduction.case1: ; preds = %[[VAL_222]]
+// CHECK2-NEXT: %[[VAL_228:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_229:.*]] = load float, ptr %[[VAL_228]], align 4
+// CHECK2-NEXT: %[[VAL_230:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_231:.*]] = load float, ptr %[[VAL_230]], align 4
+// CHECK2-NEXT: %[[VAL_232:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_233:.*]] = load float, ptr %[[VAL_232]], align 4
+// CHECK2-NEXT: %[[VAL_234:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_235:.*]] = load float, ptr %[[VAL_234]], align 4
+// CHECK2-NEXT: %[[VAL_236:.*]] = fadd float %[[VAL_229]], %[[VAL_233]]
+// CHECK2-NEXT: %[[VAL_237:.*]] = fadd float %[[VAL_231]], %[[VAL_235]]
+// CHECK2-NEXT: %[[VAL_238:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_239:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_236]], ptr %[[VAL_238]], align 4
// CHECK2-NEXT: store float %[[VAL_237]], ptr %[[VAL_239]], align 4
-// CHECK2-NEXT: store float %[[VAL_238]], ptr %[[VAL_240]], align 4
// CHECK2-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_182]], ptr @.gomp_critical_user_.reduction.var)
-// CHECK2-NEXT: br label %[[VAL_226]]
-// CHECK2: .omp.reduction.case2: ; preds = %[[VAL_223]]
-// CHECK2-NEXT: %[[VAL_241:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_242:.*]] = load float, ptr %[[VAL_241]], align 4
-// CHECK2-NEXT: %[[VAL_243:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_244:.*]] = load float, ptr %[[VAL_243]], align 4
+// CHECK2-NEXT: br label %[[VAL_225]]
+// CHECK2: .omp.reduction.case2: ; preds = %[[VAL_222]]
+// CHECK2-NEXT: %[[VAL_240:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_241:.*]] = load float, ptr %[[VAL_240]], align 4
+// CHECK2-NEXT: %[[VAL_242:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_243:.*]] = load float, ptr %[[VAL_242]], align 4
// CHECK2-NEXT: call void @__atomic_load(i32 noundef 8, ptr noundef %[[VAL_176]], ptr noundef %[[VAL_167]], i32 noundef 0)
-// CHECK2-NEXT: br label %[[VAL_245:.*]]
-// CHECK2: atomic_cont: ; preds = %[[VAL_245]], %[[VAL_228]]
-// CHECK2-NEXT: %[[VAL_246:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_167]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_247:.*]] = load float, ptr %[[VAL_246]], align 4
-// CHECK2-NEXT: %[[VAL_248:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_167]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_249:.*]] = load float, ptr %[[VAL_248]], align 4
-// CHECK2-NEXT: %[[VAL_250:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_251:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 1
-// CHECK2-NEXT: store float %[[VAL_247]], ptr %[[VAL_250]], align 4
-// CHECK2-NEXT: store float %[[VAL_249]], ptr %[[VAL_251]], align 4
-// CHECK2-NEXT: %[[VAL_252:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_253:.*]] = load float, ptr %[[VAL_252]], align 4
-// CHECK2-NEXT: %[[VAL_254:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_255:.*]] = load float, ptr %[[VAL_254]], align 4
-// CHECK2-NEXT: %[[VAL_256:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_257:.*]] = load float, ptr %[[VAL_256]], align 4
-// CHECK2-NEXT: %[[VAL_258:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_259:.*]] = load float, ptr %[[VAL_258]], align 4
-// CHECK2-NEXT: %[[VAL_260:.*]] = fadd float %[[VAL_253]], %[[VAL_257]]
-// CHECK2-NEXT: %[[VAL_261:.*]] = fadd float %[[VAL_255]], %[[VAL_259]]
-// CHECK2-NEXT: %[[VAL_262:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_168]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_263:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_168]], i32 0, i32 1
+// CHECK2-NEXT: br label %[[VAL_244:.*]]
+// CHECK2: atomic_cont: ; preds = %[[VAL_244]], %[[VAL_227]]
+// CHECK2-NEXT: %[[VAL_245:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_167]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_246:.*]] = load float, ptr %[[VAL_245]], align 4
+// CHECK2-NEXT: %[[VAL_247:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_167]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_248:.*]] = load float, ptr %[[VAL_247]], align 4
+// CHECK2-NEXT: %[[VAL_249:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_250:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_246]], ptr %[[VAL_249]], align 4
+// CHECK2-NEXT: store float %[[VAL_248]], ptr %[[VAL_250]], align 4
+// CHECK2-NEXT: %[[VAL_251:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_252:.*]] = load float, ptr %[[VAL_251]], align 4
+// CHECK2-NEXT: %[[VAL_253:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_254:.*]] = load float, ptr %[[VAL_253]], align 4
+// CHECK2-NEXT: %[[VAL_255:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_256:.*]] = load float, ptr %[[VAL_255]], align 4
+// CHECK2-NEXT: %[[VAL_257:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
+// CHECK2-NEXT: %[[VAL_258:.*]] = load float, ptr %[[VAL_257]], align 4
+// CHECK2-NEXT: %[[VAL_259:.*]] = fadd float %[[VAL_252]], %[[VAL_256]]
+// CHECK2-NEXT: %[[VAL_260:.*]] = fadd float %[[VAL_254]], %[[VAL_258]]
+// CHECK2-NEXT: %[[VAL_261:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_168]], i32 0, i32 0
+// CHECK2-NEXT: %[[VAL_262:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_168]], i32 0, i32 1
+// CHECK2-NEXT: store float %[[VAL_259]], ptr %[[VAL_261]], align 4
// CHECK2-NEXT: store float %[[VAL_260]], ptr %[[VAL_262]], align 4
-// CHECK2-NEXT: store float %[[VAL_261]], ptr %[[VAL_263]], align 4
-// CHECK2-NEXT: %[[VAL_264:.*]] = call noundef zeroext i1 @__atomic_compare_exchange(i32 noundef 8, ptr noundef %[[VAL_176]], ptr noundef %[[VAL_167]], ptr noundef %[[VAL_168]], i32 noundef 0, i32 noundef 0)
-// CHECK2-NEXT: br i1 %[[VAL_264]], label %[[VAL_265:.*]], label %[[VAL_245]]
-// CHECK2: atomic_exit: ; preds = %[[VAL_245]]
+// CHECK2-NEXT: %[[VAL_263:.*]] = call noundef zeroext i1 @__atomic_compare_exchange(i32 noundef 8, ptr noundef %[[VAL_176]], ptr noundef %[[VAL_167]], ptr noundef %[[VAL_168]], i32 noundef 0, i32 noundef 0)
+// CHECK2-NEXT: br i1 %[[VAL_263]], label %[[VAL_264:.*]], label %[[VAL_244]]
+// CHECK2: atomic_exit: ; preds = %[[VAL_244]]
// CHECK2-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_182]], ptr @.gomp_critical_user_.reduction.var)
-// CHECK2-NEXT: br label %[[VAL_226]]
-// CHECK2: .omp.reduction.default: ; preds = %[[VAL_265]], %[[VAL_227]], %[[VAL_223]]
-// CHECK2-NEXT: %[[VAL_266:.*]] = load i32, ptr %[[VAL_161]], align 4
-// CHECK2-NEXT: %[[VAL_267:.*]] = icmp ne i32 %[[VAL_266]], 0
-// CHECK2-NEXT: br i1 %[[VAL_267]], label %[[VAL_268:.*]], label %[[VAL_269:.*]]
-// CHECK2: .omp.lastprivate.then: ; preds = %[[VAL_226]]
+// CHECK2-NEXT: br label %[[VAL_225]]
+// CHECK2: .omp.reduction.default: ; preds = %[[VAL_264]], %[[VAL_226]], %[[VAL_222]]
+// CHECK2-NEXT: %[[VAL_265:.*]] = load i32, ptr %[[VAL_161]], align 4
+// CHECK2-NEXT: %[[VAL_266:.*]] = icmp ne i32 %[[VAL_265]], 0
+// CHECK2-NEXT: br i1 %[[VAL_266]], label %[[VAL_267:.*]], label %[[VAL_268:.*]]
+// CHECK2: .omp.lastprivate.then: ; preds = %[[VAL_225]]
// CHECK2-NEXT: store i32 10, ptr %[[VAL_162]], align 4
-// CHECK2-NEXT: %[[VAL_270:.*]] = load i32, ptr %[[VAL_162]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_270]], ptr %[[VAL_153]], align 4
-// CHECK2-NEXT: br label %[[VAL_269]]
-// CHECK2: .omp.lastprivate.done: ; preds = %[[VAL_268]], %[[VAL_226]]
+// CHECK2-NEXT: %[[VAL_269:.*]] = load i32, ptr %[[VAL_162]], align 4
+// CHECK2-NEXT: store i32 %[[VAL_269]], ptr %[[VAL_153]], align 4
+// CHECK2-NEXT: br label %[[VAL_268]]
+// CHECK2: .omp.lastprivate.done: ; preds = %[[VAL_267]], %[[VAL_225]]
// CHECK2-NEXT: ret void
//
//
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 6383bc1775a51e..ec5ddb117d5828 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1913,9 +1913,10 @@ class OpenMPIRBuilder {
InsertPointTy createReductions(const LocationDescription &Loc,
InsertPointTy AllocaIP,
ArrayRef<ReductionInfo> ReductionInfos,
- bool IsNoWait = false, bool IsByRef = false,
- bool IsTeamsReduction = false,
- bool HasDistribute = false);
+ bool IsNoWait = false, bool IsByRef = false);
+
+ ///}
+
/// Return the insertion point used by the underlying IRBuilder.
InsertPointTy getInsertionPoint() { return Builder.saveIP(); }
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index f538296804b9e9..cf49d7039c0e96 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3507,61 +3507,24 @@ static Function *getFreshReductionFunc(Module &M) {
".omp.reduction.func", &M);
}
-static void populateReductionFunction(
- Function *ReductionFunc,
- ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
- IRBuilder<> &Builder) {
- Module *Module = ReductionFunc->getParent();
- BasicBlock *ReductionFuncBlock =
- BasicBlock::Create(Module->getContext(), "", ReductionFunc);
- Builder.SetInsertPoint(ReductionFuncBlock);
- Value *LHSArrayPtr = nullptr;
- Value *RHSArrayPtr = nullptr;
- LHSArrayPtr = ReductionFunc->getArg(0);
- RHSArrayPtr = ReductionFunc->getArg(1);
-
- unsigned NumReductions = ReductionInfos.size();
- Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
-
- for (auto En : enumerate(ReductionInfos)) {
- const OpenMPIRBuilder::ReductionInfo &RI = En.value();
- Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedArrayTy, LHSArrayPtr, 0, En.index());
- Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
- Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
- LHSI8Ptr, RI.Variable->getType());
- Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
- Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedArrayTy, RHSArrayPtr, 0, En.index());
- Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
- Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
- RHSI8Ptr, RI.PrivateVariable->getType());
- Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
- Value *Reduced;
- Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
- if (!Builder.GetInsertBlock())
- return;
- Builder.CreateStore(Reduced, LHSPtr);
- }
- Builder.CreateRetVoid();
-}
-
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
const LocationDescription &Loc, InsertPointTy AllocaIP,
- ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef,
- bool IsTeamsReduction, bool HasDistribute) {
- if (Config.isGPU())
- return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
- IsNoWait, IsTeamsReduction, HasDistribute);
-
- checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
+ ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef) {
+ for (const ReductionInfo &RI : ReductionInfos) {
+ (void)RI;
+ assert(RI.Variable && "expected non-null variable");
+ assert(RI.PrivateVariable && "expected non-null private variable");
+ assert(RI.ReductionGen && "expected non-null reduction generator callback");
+ assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
+ "expected variables and their private equivalents to have the same "
+ "type");
+ assert(RI.Variable->getType()->isPointerTy() &&
+ "expected variables to be pointers");
+ }
if (!updateToLocation(Loc))
return InsertPointTy();
- if (ReductionInfos.size() == 0)
- return Builder.saveIP();
-
BasicBlock *InsertBlock = Loc.IP.getBlock();
BasicBlock *ContinuationBlock =
InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
@@ -3602,7 +3565,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
const DataLayout &DL = Module->getDataLayout();
unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
- Function *ReductionFunc = getFreshReductionFunc(M);
+ Function *ReductionFunc = getFreshReductionFunc(*Module);
Value *Lock = getOMPCriticalRegionLock(".reduction");
Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
@@ -3681,7 +3644,35 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
// Populate the outlined reduction function using the elementwise reduction
// function. Partial values are extracted from the type-erased array of
// pointers to private variables.
- populateReductionFunction(ReductionFunc, ReductionInfos, Builder);
+ BasicBlock *ReductionFuncBlock =
+ BasicBlock::Create(Module->getContext(), "", ReductionFunc);
+ Builder.SetInsertPoint(ReductionFuncBlock);
+ Value *LHSArrayPtr = ReductionFunc->getArg(0);
+ Value *RHSArrayPtr = ReductionFunc->getArg(1);
+
+ for (auto En : enumerate(ReductionInfos)) {
+ const ReductionInfo &RI = En.value();
+ Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedArrayTy, LHSArrayPtr, 0, En.index());
+ Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+ Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
+ Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+ Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedArrayTy, RHSArrayPtr, 0, En.index());
+ Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
+ Value *RHSPtr =
+ Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
+ Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+ Value *Reduced;
+ Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
+ if (!Builder.GetInsertBlock())
+ return InsertPointTy();
+ // store is inside of the reduction region when using by-ref
+ if (!IsByRef)
+ Builder.CreateStore(Reduced, LHSPtr);
+ }
+ Builder.CreateRetVoid();
+
Builder.SetInsertPoint(ContinuationBlock);
return Builder.saveIP();
}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 7657f28c9539a0..e9c231bf499b11 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -3131,26 +3131,26 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation(
moduleTranslation);
return failure();
})
- .Case(
- "omp.requires",
- [&](Attribute attr) {
- if (auto requiresAttr = attr.dyn_cast<omp::ClauseRequiresAttr>()) {
- using Requires = omp::ClauseRequires;
- Requires flags = requiresAttr.getValue();
- llvm::OpenMPIRBuilderConfig &config =
- moduleTranslation.getOpenMPBuilder()->Config;
- config.setHasRequiresReverseOffload(
- bitEnumContainsAll(flags, Requires::reverse_offload));
- config.setHasRequiresUnifiedAddress(
- bitEnumContainsAll(flags, Requires::unified_address));
- config.setHasRequiresUnifiedSharedMemory(
- bitEnumContainsAll(flags, Requires::unified_shared_memory));
- config.setHasRequiresDynamicAllocators(
- bitEnumContainsAll(flags, Requires::dynamic_allocators));
- return convertRequiresAttr(*op, requiresAttr, moduleTranslation);
- }
- return failure();
- })
+ .Case("omp.requires",
+ [&](Attribute attr) {
+ if (auto requiresAttr =
+ attr.dyn_cast<omp::ClauseRequiresAttr>()) {
+ using Requires = omp::ClauseRequires;
+ Requires flags = requiresAttr.getValue();
+ llvm::OpenMPIRBuilderConfig &config =
+ moduleTranslation.getOpenMPBuilder()->Config;
+ config.setHasRequiresReverseOffload(
+ bitEnumContainsAll(flags, Requires::reverse_offload));
+ config.setHasRequiresUnifiedAddress(
+ bitEnumContainsAll(flags, Requires::unified_address));
+ config.setHasRequiresUnifiedSharedMemory(
+ bitEnumContainsAll(flags, Requires::unified_shared_memory));
+ config.setHasRequiresDynamicAllocators(
+ bitEnumContainsAll(flags, Requires::dynamic_allocators));
+ return success();
+ }
+ return failure();
+ })
.Default([](Attribute) {
// Fall through for omp attributes that do not require lowering.
return success();
>From 16d6fe80580723b89744973643b1b240ec3ae891 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 27 Mar 2024 17:01:27 +0000
Subject: [PATCH 16/18] Addressed reviewer comments.
---
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 12 +-
clang/test/OpenMP/reduction_complex.c | 977 +-----------------
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 34 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 28 +-
.../Frontend/OpenMPIRBuilderTest.cpp | 12 +-
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 4 +-
6 files changed, 46 insertions(+), 1021 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 919cc048ea6d52..92b8c60d1ebc1a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1685,7 +1685,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
llvm::Type *ElementType;
llvm::Value *Variable;
llvm::Value *PrivateVariable;
- llvm::OpenMPIRBuilder::AtomicReductionGenCB AtomicReductionGen = nullptr;
+ llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
ElementType = CGF.ConvertTypeForMem(Private->getType());
const auto *RHSVar =
cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
@@ -1693,16 +1693,16 @@ void CGOpenMPRuntimeGPU::emitReduction(
const auto *LHSVar =
cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
Variable = CGF.GetAddrOfLocalVar(LHSVar).getPointer();
- llvm::OpenMPIRBuilder::EvaluationKindTy EvalKind;
+ llvm::OpenMPIRBuilder::EvaluationKind EvalKind;
switch (CGF.getEvaluationKind(Private->getType())) {
case TEK_Scalar:
- EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar;
+ EvalKind = llvm::OpenMPIRBuilder::EvaluationKind::Scalar;
break;
case TEK_Complex:
- EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Complex;
+ EvalKind = llvm::OpenMPIRBuilder::EvaluationKind::Complex;
break;
case TEK_Aggregate:
- EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Aggregate;
+ EvalKind = llvm::OpenMPIRBuilder::EvaluationKind::Aggregate;
break;
}
auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
@@ -1736,7 +1736,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
- DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBTy::Clang,
+ DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
RTLoc));
return;
diff --git a/clang/test/OpenMP/reduction_complex.c b/clang/test/OpenMP/reduction_complex.c
index cb7c20926d4ea2..e00caa8f90fdf7 100644
--- a/clang/test/OpenMP/reduction_complex.c
+++ b/clang/test/OpenMP/reduction_complex.c
@@ -10,18 +10,6 @@
// RUN: -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc \
// RUN: -o - | FileCheck %s --check-prefix CHECK
-// RUN: %clang_cc1 -verify -fopenmp -x c++ \
-// RUN: -triple powerpc64le-unknown-unknown -DDIAG\
-// RUN: -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm \
-// RUN: %s -o - | FileCheck %s \
-// RUN: --check-prefix=CHECK1
-
-// RUN: %clang_cc1 -verify -fopenmp -x c++ \
-// RUN: -triple i386-unknown-unknown \
-// RUN: -fopenmp-targets=i386-pc-linux-gnu -emit-llvm \
-// RUN: %s -o - | FileCheck %s \
-// RUN: --check-prefix=CHECK2
-
// expected-no-diagnostics
int foo() {
int i;
@@ -36,177 +24,6 @@ int foo() {
return 0;
}
-// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31
-// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: %[[VAL_0:.*]] = alloca ptr, align 8
-// CHECK-NEXT: %[[VAL_1:.*]] = alloca i64, align 8
-// CHECK-NEXT: %[[VAL_2:.*]] = alloca ptr, align 8
-// CHECK-NEXT: %[[VAL_3:.*]] = alloca i64, align 8
-// CHECK-NEXT: %[[VAL_4:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_5:.*]] = alloca i32, align 4
-// CHECK-NEXT: store ptr %[[VAL_6:.*]], ptr %[[VAL_0]], align 8
-// CHECK-NEXT: store i64 %[[VAL_7:.*]], ptr %[[VAL_1]], align 8
-// CHECK-NEXT: store ptr %[[VAL_8:.*]], ptr %[[VAL_2]], align 8
-// CHECK-NEXT: %[[VAL_9:.*]] = load ptr, ptr %[[VAL_2]], align 8
-// CHECK-NEXT: %[[VAL_10:.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31_kernel_environment, ptr %[[VAL_6]])
-// CHECK-NEXT: %[[VAL_11:.*]] = icmp eq i32 %[[VAL_10]], -1
-// CHECK-NEXT: br i1 %[[VAL_11]], label %[[VAL_12:.*]], label %[[VAL_13:.*]]
-// CHECK: user_code.entry: ; preds = %[[VAL_14:.*]]
-// CHECK-NEXT: %[[VAL_15:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
-// CHECK-NEXT: %[[VAL_16:.*]] = load i32, ptr %[[VAL_1]], align 4
-// CHECK-NEXT: store i32 %[[VAL_16]], ptr %[[VAL_3]], align 4
-// CHECK-NEXT: %[[VAL_17:.*]] = load i64, ptr %[[VAL_3]], align 8
-// CHECK-NEXT: store i32 0, ptr %[[VAL_4]], align 4
-// CHECK-NEXT: store i32 %[[VAL_15]], ptr %[[VAL_5]], align 4
-// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31_omp_outlined(ptr %[[VAL_5]], ptr %[[VAL_4]], i64 %[[VAL_17]], ptr %[[VAL_9]]) #2
-// CHECK-NEXT: call void @__kmpc_target_deinit()
-// CHECK-NEXT: ret void
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31_omp_outlined
-// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: %[[VAL_18:.*]] = alloca ptr, align 8
-// CHECK-NEXT: %[[VAL_19:.*]] = alloca ptr, align 8
-// CHECK-NEXT: %[[VAL_20:.*]] = alloca i64, align 8
-// CHECK-NEXT: %[[VAL_21:.*]] = alloca ptr, align 8
-// CHECK-NEXT: %[[VAL_22:.*]] = alloca { float, float }, align 4
-// CHECK-NEXT: %[[VAL_23:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_24:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_25:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_26:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_27:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_28:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_29:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_30:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_31:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_32:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_33:.*]] = alloca i64, align 8
-// CHECK-NEXT: %[[VAL_34:.*]] = alloca [4 x ptr], align 8
-// CHECK-NEXT: %[[VAL_35:.*]] = alloca [1 x ptr], align 8
-// CHECK-NEXT: store ptr %[[VAL_36:.*]], ptr %[[VAL_18]], align 8
-// CHECK-NEXT: store ptr %[[VAL_37:.*]], ptr %[[VAL_19]], align 8
-// CHECK-NEXT: store i64 %[[VAL_38:.*]], ptr %[[VAL_20]], align 8
-// CHECK-NEXT: store ptr %[[VAL_39:.*]], ptr %[[VAL_21]], align 8
-// CHECK-NEXT: %[[VAL_40:.*]] = load ptr, ptr %[[VAL_21]], align 8
-// CHECK-NEXT: %[[VAL_41:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_22]], i32 0, i32 0
-// CHECK-NEXT: %[[VAL_42:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_22]], i32 0, i32 1
-// CHECK-NEXT: store float 0.000000e+00, ptr %[[VAL_41]], align 4
-// CHECK-NEXT: store float 0.000000e+00, ptr %[[VAL_42]], align 4
-// CHECK-NEXT: store i32 0, ptr %[[VAL_26]], align 4
-// CHECK-NEXT: store i32 99, ptr %[[VAL_27]], align 4
-// CHECK-NEXT: store i32 1, ptr %[[VAL_28]], align 4
-// CHECK-NEXT: store i32 0, ptr %[[VAL_29]], align 4
-// CHECK-NEXT: %[[VAL_43:.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-// CHECK-NEXT: %[[VAL_44:.*]] = load ptr, ptr %[[VAL_18]], align 8
-// CHECK-NEXT: %[[VAL_45:.*]] = load i32, ptr %[[VAL_44]], align 4
-// CHECK-NEXT: call void @__kmpc_distribute_static_init_4(ptr @2, i32 %[[VAL_45]], i32 91, ptr %[[VAL_29]], ptr %[[VAL_26]], ptr %[[VAL_27]], ptr %[[VAL_28]], i32 1, i32 %[[VAL_43]])
-// CHECK-NEXT: %[[VAL_46:.*]] = load i32, ptr %[[VAL_27]], align 4
-// CHECK-NEXT: %[[VAL_47:.*]] = icmp sgt i32 %[[VAL_46]], 99
-// CHECK-NEXT: br i1 %[[VAL_47]], label %[[VAL_48:.*]], label %[[VAL_49:.*]]
-// CHECK: cond.true: ; preds = %[[VAL_50:.*]]
-// CHECK-NEXT: br label %[[VAL_51:.*]]
-// CHECK: cond.false: ; preds = %[[VAL_50]]
-// CHECK-NEXT: %[[VAL_52:.*]] = load i32, ptr %[[VAL_27]], align 4
-// CHECK-NEXT: br label %[[VAL_51]]
-// CHECK: cond.end: ; preds = %[[VAL_49]], %[[VAL_48]]
-// CHECK-NEXT: %[[VAL_53:.*]] = phi i32 [ 99, %[[VAL_48]] ], [ %[[VAL_52]], %[[VAL_49]] ]
-// CHECK-NEXT: store i32 %[[VAL_53]], ptr %[[VAL_27]], align 4
-// CHECK-NEXT: %[[VAL_54:.*]] = load i32, ptr %[[VAL_26]], align 4
-// CHECK-NEXT: store i32 %[[VAL_54]], ptr %[[VAL_23]], align 4
-// CHECK-NEXT: br label %[[VAL_55:.*]]
-// CHECK: omp.inner.for.cond: ; preds = %[[VAL_56:.*]], %[[VAL_51]]
-// CHECK-NEXT: %[[VAL_57:.*]] = load i32, ptr %[[VAL_23]], align 4
-// CHECK-NEXT: %[[VAL_58:.*]] = icmp slt i32 %[[VAL_57]], 100
-// CHECK-NEXT: br i1 %[[VAL_58]], label %[[VAL_59:.*]], label %[[VAL_60:.*]]
-// CHECK: omp.inner.for.body: ; preds = %[[VAL_55]]
-// CHECK-NEXT: %[[VAL_61:.*]] = load i32, ptr %[[VAL_26]], align 4
-// CHECK-NEXT: %[[VAL_62:.*]] = zext i32 %[[VAL_61]] to i64
-// CHECK-NEXT: %[[VAL_63:.*]] = load i32, ptr %[[VAL_27]], align 4
-// CHECK-NEXT: %[[VAL_64:.*]] = zext i32 %[[VAL_63]] to i64
-// CHECK-NEXT: %[[VAL_65:.*]] = load i32, ptr %[[VAL_30]], align 4
-// CHECK-NEXT: store i32 %[[VAL_65]], ptr %[[VAL_33]], align 4
-// CHECK-NEXT: %[[VAL_66:.*]] = load i64, ptr %[[VAL_33]], align 8
-// CHECK-NEXT: %[[VAL_67:.*]] = getelementptr inbounds [4 x ptr], ptr %[[VAL_34]], i64 0, i64 0
-// CHECK-NEXT: %[[VAL_68:.*]] = inttoptr i64 %[[VAL_62]] to ptr
-// CHECK-NEXT: store ptr %[[VAL_68]], ptr %[[VAL_67]], align 8
-// CHECK-NEXT: %[[VAL_69:.*]] = getelementptr inbounds [4 x ptr], ptr %[[VAL_34]], i64 0, i64 1
-// CHECK-NEXT: %[[VAL_70:.*]] = inttoptr i64 %[[VAL_64]] to ptr
-// CHECK-NEXT: store ptr %[[VAL_70]], ptr %[[VAL_69]], align 8
-// CHECK-NEXT: %[[VAL_71:.*]] = getelementptr inbounds [4 x ptr], ptr %[[VAL_34]], i64 0, i64 2
-// CHECK-NEXT: %[[VAL_72:.*]] = inttoptr i64 %[[VAL_66]] to ptr
-// CHECK-NEXT: store ptr %[[VAL_72]], ptr %[[VAL_71]], align 8
-// CHECK-NEXT: %[[VAL_73:.*]] = getelementptr inbounds [4 x ptr], ptr %[[VAL_34]], i64 0, i64 3
-// CHECK-NEXT: store ptr %[[VAL_22]], ptr %[[VAL_73]], align 8
-// CHECK-NEXT: call void @__kmpc_parallel_51(ptr @1, i32 %[[VAL_45]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31_omp_outlined_omp_outlined, ptr null, ptr %[[VAL_34]], i64 4)
-// CHECK-NEXT: br label %[[VAL_74:.*]]
-// CHECK: omp.inner.for.inc: ; preds = %[[VAL_59]]
-// CHECK-NEXT: %[[VAL_75:.*]] = load i32, ptr %[[VAL_23]], align 4
-// CHECK-NEXT: %[[VAL_76:.*]] = load i32, ptr %[[VAL_28]], align 4
-// CHECK-NEXT: %[[VAL_77:.*]] = add nsw i32 %[[VAL_75]], %[[VAL_76]]
-// CHECK-NEXT: store i32 %[[VAL_77]], ptr %[[VAL_23]], align 4
-// CHECK-NEXT: %[[VAL_78:.*]] = load i32, ptr %[[VAL_26]], align 4
-// CHECK-NEXT: %[[VAL_79:.*]] = load i32, ptr %[[VAL_28]], align 4
-// CHECK-NEXT: %[[VAL_80:.*]] = add nsw i32 %[[VAL_78]], %[[VAL_79]]
-// CHECK-NEXT: store i32 %[[VAL_80]], ptr %[[VAL_26]], align 4
-// CHECK-NEXT: %[[VAL_81:.*]] = load i32, ptr %[[VAL_27]], align 4
-// CHECK-NEXT: %[[VAL_82:.*]] = load i32, ptr %[[VAL_28]], align 4
-// CHECK-NEXT: %[[VAL_83:.*]] = add nsw i32 %[[VAL_81]], %[[VAL_82]]
-// CHECK-NEXT: store i32 %[[VAL_83]], ptr %[[VAL_27]], align 4
-// CHECK-NEXT: %[[VAL_84:.*]] = load i32, ptr %[[VAL_27]], align 4
-// CHECK-NEXT: %[[VAL_85:.*]] = icmp sgt i32 %[[VAL_84]], 99
-// CHECK-NEXT: br i1 %[[VAL_85]], label %[[VAL_86:.*]], label %[[VAL_87:.*]]
-// CHECK: cond.true9: ; preds = %[[VAL_74]]
-// CHECK-NEXT: br label %[[VAL_56]]
-// CHECK: cond.false10: ; preds = %[[VAL_74]]
-// CHECK-NEXT: %[[VAL_88:.*]] = load i32, ptr %[[VAL_27]], align 4
-// CHECK-NEXT: br label %[[VAL_56]]
-// CHECK: cond.end11: ; preds = %[[VAL_87]], %[[VAL_86]]
-// CHECK-NEXT: %[[VAL_89:.*]] = phi i32 [ 99, %[[VAL_86]] ], [ %[[VAL_88]], %[[VAL_87]] ]
-// CHECK-NEXT: store i32 %[[VAL_89]], ptr %[[VAL_27]], align 4
-// CHECK-NEXT: %[[VAL_90:.*]] = load i32, ptr %[[VAL_26]], align 4
-// CHECK-NEXT: store i32 %[[VAL_90]], ptr %[[VAL_23]], align 4
-// CHECK-NEXT: br label %[[VAL_55]]
-// CHECK: omp.inner.for.end: ; preds = %[[VAL_55]]
-// CHECK-NEXT: br label %[[VAL_91:.*]]
-// CHECK: omp.loop.exit: ; preds = %[[VAL_60]]
-// CHECK-NEXT: call void @__kmpc_distribute_static_fini(ptr @2, i32 %[[VAL_45]])
-// CHECK-NEXT: %[[VAL_92:.*]] = load i32, ptr %[[VAL_29]], align 4
-// CHECK-NEXT: %[[VAL_93:.*]] = icmp ne i32 %[[VAL_92]], 0
-// CHECK-NEXT: br i1 %[[VAL_93]], label %[[VAL_94:.*]], label %[[VAL_95:.*]]
-// CHECK: .omp.lastprivate.then: ; preds = %[[VAL_91]]
-// CHECK-NEXT: store i32 10, ptr %[[VAL_30]], align 4
-// CHECK-NEXT: %[[VAL_96:.*]] = load i32, ptr %[[VAL_30]], align 4
-// CHECK-NEXT: store i32 %[[VAL_96]], ptr %[[VAL_20]], align 4
-// CHECK-NEXT: br label %[[VAL_95]]
-// CHECK: .omp.lastprivate.done: ; preds = %[[VAL_94]], %[[VAL_91]]
-// CHECK-NEXT: %[[VAL_97:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_35]], i64 0, i64 0
-// CHECK-NEXT: store ptr %[[VAL_22]], ptr %[[VAL_97]], align 8
-// CHECK-NEXT: %[[VAL_98:.*]]"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
-// CHECK-NEXT: %[[VAL_99:.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @1, ptr %[[VAL_98]]"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, i64 8, ptr %[[VAL_35]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
-// CHECK-NEXT: %[[VAL_100:.*]] = icmp eq i32 %[[VAL_99]], 1
-// CHECK-NEXT: br i1 %[[VAL_100]], label %[[VAL_101:.*]], label %[[VAL_102:.*]]
-// CHECK: .omp.reduction.then: ; preds = %[[VAL_95]]
-// CHECK-NEXT: %[[VAL_103:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_40]], i32 0, i32 0
-// CHECK-NEXT: %[[VAL_104:.*]] = load float, ptr %[[VAL_103]], align 4
-// CHECK-NEXT: %[[VAL_105:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_40]], i32 0, i32 1
-// CHECK-NEXT: %[[VAL_106:.*]] = load float, ptr %[[VAL_105]], align 4
-// CHECK-NEXT: %[[VAL_107:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_22]], i32 0, i32 0
-// CHECK-NEXT: %[[VAL_108:.*]] = load float, ptr %[[VAL_107]], align 4
-// CHECK-NEXT: %[[VAL_109:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_22]], i32 0, i32 1
-// CHECK-NEXT: %[[VAL_110:.*]] = load float, ptr %[[VAL_109]], align 4
-// CHECK-NEXT: %[[VAL_111:.*]] = fadd float %[[VAL_104]], %[[VAL_108]]
-// CHECK-NEXT: %[[VAL_112:.*]] = fadd float %[[VAL_106]], %[[VAL_110]]
-// CHECK-NEXT: %[[VAL_113:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_40]], i32 0, i32 0
-// CHECK-NEXT: %[[VAL_114:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_40]], i32 0, i32 1
-// CHECK-NEXT: store float %[[VAL_111]], ptr %[[VAL_113]], align 4
-// CHECK-NEXT: store float %[[VAL_112]], ptr %[[VAL_114]], align 4
-// CHECK-NEXT: br label %[[VAL_102]]
-// CHECK: .omp.reduction.done: ; preds = %[[VAL_101]], %[[VAL_95]]
-// CHECK-NEXT: ret void
-//
-//
// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK-NEXT: entry:
@@ -250,7 +67,7 @@ int foo() {
// CHECK-NEXT: %[[VAL_263:.*]] = or i1 %[[VAL_262]], %[[VAL_261]]
// CHECK-NEXT: br i1 %[[VAL_263]], label %[[VAL_264:.*]], label %[[VAL_265:.*]]
// CHECK: then: ; preds = %[[VAL_266:.*]]
-// CHECK-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr %[[VAL_238]], ptr %[[VAL_232]]) #2
+// CHECK-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l{{[0-9]+}}_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr %[[VAL_238]], ptr %[[VAL_232]]) #2
// CHECK-NEXT: br label %[[VAL_267:.*]]
// CHECK: else: ; preds = %[[VAL_266]]
// CHECK-NEXT: br label %[[VAL_267]]
@@ -277,795 +94,3 @@ int foo() {
// CHECK-NEXT: br label %[[VAL_283]]
// CHECK: ifcont8: ; preds = %[[VAL_272]], %[[VAL_271]]
// CHECK-NEXT: ret void
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
-// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: %[[VAL_400:.*]] = alloca ptr, align 8
-// CHECK-NEXT: %[[VAL_401:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_402:.*]] = alloca i32, align 4
-// CHECK-NEXT: %[[VAL_403:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
-// CHECK-NEXT: store ptr %[[VAL_404:.*]], ptr %[[VAL_400]], align 8
-// CHECK-NEXT: store i32 %[[VAL_405:.*]], ptr %[[VAL_401]], align 4
-// CHECK-NEXT: %[[VAL_406:.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-NEXT: %[[VAL_407:.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-NEXT: %[[VAL_408:.*]] = and i32 %[[VAL_407]], 31
-// CHECK-NEXT: %[[VAL_409:.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-NEXT: %[[VAL_410:.*]] = ashr i32 %[[VAL_409]], 5
-// CHECK-NEXT: %[[VAL_411:.*]] = load ptr, ptr %[[VAL_400]], align 8
-// CHECK-NEXT: store i32 0, ptr %[[VAL_402]], align 4
-// CHECK-NEXT: br label %[[VAL_412:.*]]
-// CHECK: precond: ; preds = %[[VAL_413:.*]], %[[VAL_414:.*]]
-// CHECK-NEXT: %[[VAL_415:.*]] = load i32, ptr %[[VAL_402]], align 4
-// CHECK-NEXT: %[[VAL_416:.*]] = icmp ult i32 %[[VAL_415]], 2
-// CHECK-NEXT: br i1 %[[VAL_416]], label %[[VAL_417:.*]], label %[[VAL_418:.*]]
-// CHECK: body: ; preds = %[[VAL_412]]
-// CHECK-NEXT: call void @__kmpc_barrier(ptr @4, i32 %[[VAL_403]])
-// CHECK-NEXT: %[[VAL_419:.*]] = icmp eq i32 %[[VAL_408]], 0
-// CHECK-NEXT: br i1 %[[VAL_419]], label %[[VAL_420:.*]], label %[[VAL_421:.*]]
-// CHECK: then: ; preds = %[[VAL_417]]
-// CHECK-NEXT: %[[VAL_422:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_411]], i64 0, i64 0
-// CHECK-NEXT: %[[VAL_423:.*]] = load ptr, ptr %[[VAL_422]], align 8
-// CHECK-NEXT: %[[VAL_424:.*]] = getelementptr i32, ptr %[[VAL_423]], i32 %[[VAL_415]]
-// CHECK-NEXT: %[[VAL_425:.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 %[[VAL_410]]
-// CHECK-NEXT: %[[VAL_426:.*]] = load i32, ptr %[[VAL_424]], align 4
-// CHECK-NEXT: store volatile i32 %[[VAL_426]], ptr addrspace(3) %[[VAL_425]], align 4
-// CHECK-NEXT: br label %[[VAL_427:.*]]
-// CHECK: else: ; preds = %[[VAL_417]]
-// CHECK-NEXT: br label %[[VAL_427]]
-// CHECK: ifcont: ; preds = %[[VAL_421]], %[[VAL_420]]
-// CHECK-NEXT: call void @__kmpc_barrier(ptr @4, i32 %[[VAL_403]])
-// CHECK-NEXT: %[[VAL_428:.*]] = load i32, ptr %[[VAL_401]], align 4
-// CHECK-NEXT: %[[VAL_429:.*]] = icmp ult i32 %[[VAL_406]], %[[VAL_428]]
-// CHECK-NEXT: br i1 %[[VAL_429]], label %[[VAL_430:.*]], label %[[VAL_431:.*]]
-// CHECK: then2: ; preds = %[[VAL_427]]
-// CHECK-NEXT: %[[VAL_432:.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 %[[VAL_406]]
-// CHECK-NEXT: %[[VAL_433:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_411]], i64 0, i64 0
-// CHECK-NEXT: %[[VAL_434:.*]] = load ptr, ptr %[[VAL_433]], align 8
-// CHECK-NEXT: %[[VAL_435:.*]] = getelementptr i32, ptr %[[VAL_434]], i32 %[[VAL_415]]
-// CHECK-NEXT: %[[VAL_436:.*]] = load volatile i32, ptr addrspace(3) %[[VAL_432]], align 4
-// CHECK-NEXT: store i32 %[[VAL_436]], ptr %[[VAL_435]], align 4
-// CHECK-NEXT: br label %[[VAL_413]]
-// CHECK: else3: ; preds = %[[VAL_427]]
-// CHECK-NEXT: br label %[[VAL_413]]
-// CHECK: ifcont4: ; preds = %[[VAL_431]], %[[VAL_430]]
-// CHECK-NEXT: %[[VAL_437:.*]] = add nsw i32 %[[VAL_415]], 1
-// CHECK-NEXT: store i32 %[[VAL_437]], ptr %[[VAL_402]], align 4
-// CHECK-NEXT: br label %[[VAL_412]]
-// CHECK: exit: ; preds = %[[VAL_412]]
-// CHECK-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_Z3foov
-// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: %[[VAL_0:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_1:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_2:.*]] = alloca { float, float }, align 4
-// CHECK1-NEXT: %[[VAL_3:.*]] = alloca i64, align 8
-// CHECK1-NEXT: %[[VAL_4:.*]] = alloca [2 x ptr], align 8
-// CHECK1-NEXT: %[[VAL_5:.*]] = alloca [2 x ptr], align 8
-// CHECK1-NEXT: %[[VAL_6:.*]] = alloca [2 x ptr], align 8
-// CHECK1-NEXT: %[[VAL_7:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_8:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_9:.*]] = alloca %[[VAL_10:.*]], align 8
-// CHECK1-NEXT: %[[VAL_11:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_2]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_12:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_2]], i32 0, i32 1
-// CHECK1-NEXT: store float 0.000000e+00, ptr %[[VAL_11]], align 4
-// CHECK1-NEXT: store float 0.000000e+00, ptr %[[VAL_12]], align 4
-// CHECK1-NEXT: %[[VAL_13:.*]] = load i32, ptr %[[VAL_1]], align 4
-// CHECK1-NEXT: store i32 %[[VAL_13]], ptr %[[VAL_3]], align 4
-// CHECK1-NEXT: %[[VAL_14:.*]] = load i64, ptr %[[VAL_3]], align 8
-// CHECK1-NEXT: %[[VAL_15:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 0
-// CHECK1-NEXT: store i64 %[[VAL_14]], ptr %[[VAL_15]], align 8
-// CHECK1-NEXT: %[[VAL_16:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 0
-// CHECK1-NEXT: store i64 %[[VAL_14]], ptr %[[VAL_16]], align 8
-// CHECK1-NEXT: %[[VAL_17:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_6]], i64 0, i64 0
-// CHECK1-NEXT: store ptr null, ptr %[[VAL_17]], align 8
-// CHECK1-NEXT: %[[VAL_18:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 1
-// CHECK1-NEXT: store ptr %[[VAL_2]], ptr %[[VAL_18]], align 8
-// CHECK1-NEXT: %[[VAL_19:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 1
-// CHECK1-NEXT: store ptr %[[VAL_2]], ptr %[[VAL_19]], align 8
-// CHECK1-NEXT: %[[VAL_20:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_6]], i64 0, i64 1
-// CHECK1-NEXT: store ptr null, ptr %[[VAL_20]], align 8
-// CHECK1-NEXT: %[[VAL_21:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_22:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_23:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 0
-// CHECK1-NEXT: store i32 2, ptr %[[VAL_23]], align 4
-// CHECK1-NEXT: %[[VAL_24:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 1
-// CHECK1-NEXT: store i32 2, ptr %[[VAL_24]], align 4
-// CHECK1-NEXT: %[[VAL_25:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 2
-// CHECK1-NEXT: store ptr %[[VAL_21]], ptr %[[VAL_25]], align 8
-// CHECK1-NEXT: %[[VAL_26:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 3
-// CHECK1-NEXT: store ptr %[[VAL_22]], ptr %[[VAL_26]], align 8
-// CHECK1-NEXT: %[[VAL_27:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 4
-// CHECK1-NEXT: store ptr @.offload_sizes, ptr %[[VAL_27]], align 8
-// CHECK1-NEXT: %[[VAL_28:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 5
-// CHECK1-NEXT: store ptr @.offload_maptypes, ptr %[[VAL_28]], align 8
-// CHECK1-NEXT: %[[VAL_29:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 6
-// CHECK1-NEXT: store ptr null, ptr %[[VAL_29]], align 8
-// CHECK1-NEXT: %[[VAL_30:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 7
-// CHECK1-NEXT: store ptr null, ptr %[[VAL_30]], align 8
-// CHECK1-NEXT: %[[VAL_31:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 8
-// CHECK1-NEXT: store i64 100, ptr %[[VAL_31]], align 8
-// CHECK1-NEXT: %[[VAL_32:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 9
-// CHECK1-NEXT: store i64 0, ptr %[[VAL_32]], align 8
-// CHECK1-NEXT: %[[VAL_33:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 10
-// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr %[[VAL_33]], align 4
-// CHECK1-NEXT: %[[VAL_34:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 11
-// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr %[[VAL_34]], align 4
-// CHECK1-NEXT: %[[VAL_35:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 12
-// CHECK1-NEXT: store i32 0, ptr %[[VAL_35]], align 4
-// CHECK1-NEXT: %[[VAL_36:.*]] = call i32 @__tgt_target_kernel(ptr @4, i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.region_id, ptr %[[VAL_9]])
-// CHECK1-NEXT: %[[VAL_37:.*]] = icmp ne i32 %[[VAL_36]], 0
-// CHECK1-NEXT: br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_39:.*]]
-// CHECK1: omp_offload.failed: ; preds = %[[VAL_40:.*]]
-// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31(i64 %[[VAL_14]], ptr %[[VAL_2]]) #2
-// CHECK1-NEXT: br label %[[VAL_39]]
-// CHECK1: omp_offload.cont: ; preds = %[[VAL_38]], %[[VAL_40]]
-// CHECK1-NEXT: ret i32 0
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31
-// CHECK1-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: %[[VAL_41:.*]] = alloca i64, align 8
-// CHECK1-NEXT: %[[VAL_42:.*]] = alloca ptr, align 8
-// CHECK1-NEXT: %[[VAL_43:.*]] = alloca i64, align 8
-// CHECK1-NEXT: store i64 %[[VAL_44:.*]], ptr %[[VAL_41]], align 8
-// CHECK1-NEXT: store ptr %[[VAL_45:.*]], ptr %[[VAL_42]], align 8
-// CHECK1-NEXT: %[[VAL_46:.*]] = load ptr, ptr %[[VAL_42]], align 8
-// CHECK1-NEXT: %[[VAL_47:.*]] = load i32, ptr %[[VAL_41]], align 4
-// CHECK1-NEXT: store i32 %[[VAL_47]], ptr %[[VAL_43]], align 4
-// CHECK1-NEXT: %[[VAL_48:.*]] = load i64, ptr %[[VAL_43]], align 8
-// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @4, i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined, i64 %[[VAL_48]], ptr %[[VAL_46]])
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: %[[VAL_49:.*]] = alloca ptr, align 8
-// CHECK1-NEXT: %[[VAL_50:.*]] = alloca ptr, align 8
-// CHECK1-NEXT: %[[VAL_51:.*]] = alloca i64, align 8
-// CHECK1-NEXT: %[[VAL_52:.*]] = alloca ptr, align 8
-// CHECK1-NEXT: %[[VAL_53:.*]] = alloca { float, float }, align 4
-// CHECK1-NEXT: %[[VAL_54:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_55:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_56:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_57:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_58:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_59:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_60:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_61:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_62:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_63:.*]] = alloca i32, align 4
-// CHECK1-NEXT: %[[VAL_64:.*]] = alloca i64, align 8
-// CHECK1-NEXT: %[[VAL_65:.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT: %[[VAL_66:.*]] = alloca { float, float }, align 4
-// CHECK1-NEXT: %[[VAL_67:.*]] = alloca { float, float }, align 4
-// CHECK1-NEXT: %[[VAL_68:.*]] = alloca { float, float }, align 4
-// CHECK1-NEXT: store ptr %[[VAL_69:.*]], ptr %[[VAL_49]], align 8
-// CHECK1-NEXT: store ptr %[[VAL_70:.*]], ptr %[[VAL_50]], align 8
-// CHECK1-NEXT: store i64 %[[VAL_71:.*]], ptr %[[VAL_51]], align 8
-// CHECK1-NEXT: store ptr %[[VAL_72:.*]], ptr %[[VAL_52]], align 8
-// CHECK1-NEXT: %[[VAL_73:.*]] = load ptr, ptr %[[VAL_52]], align 8
-// CHECK1-NEXT: %[[VAL_74:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_75:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
-// CHECK1-NEXT: store float 0.000000e+00, ptr %[[VAL_74]], align 4
-// CHECK1-NEXT: store float 0.000000e+00, ptr %[[VAL_75]], align 4
-// CHECK1-NEXT: store i32 0, ptr %[[VAL_57]], align 4
-// CHECK1-NEXT: store i32 99, ptr %[[VAL_58]], align 4
-// CHECK1-NEXT: store i32 1, ptr %[[VAL_59]], align 4
-// CHECK1-NEXT: store i32 0, ptr %[[VAL_60]], align 4
-// CHECK1-NEXT: %[[VAL_76:.*]] = load ptr, ptr %[[VAL_49]], align 8
-// CHECK1-NEXT: %[[VAL_77:.*]] = load i32, ptr %[[VAL_76]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @1, i32 %[[VAL_77]], i32 92, ptr %[[VAL_60]], ptr %[[VAL_57]], ptr %[[VAL_58]], ptr %[[VAL_59]], i32 1, i32 1)
-// CHECK1-NEXT: %[[VAL_78:.*]] = load i32, ptr %[[VAL_58]], align 4
-// CHECK1-NEXT: %[[VAL_79:.*]] = icmp sgt i32 %[[VAL_78]], 99
-// CHECK1-NEXT: br i1 %[[VAL_79]], label %[[VAL_80:.*]], label %[[VAL_81:.*]]
-// CHECK1: cond.true: ; preds = %[[VAL_82:.*]]
-// CHECK1-NEXT: br label %[[VAL_83:.*]]
-// CHECK1: cond.false: ; preds = %[[VAL_82]]
-// CHECK1-NEXT: %[[VAL_84:.*]] = load i32, ptr %[[VAL_58]], align 4
-// CHECK1-NEXT: br label %[[VAL_83]]
-// CHECK1: cond.end: ; preds = %[[VAL_81]], %[[VAL_80]]
-// CHECK1-NEXT: %[[VAL_85:.*]] = phi i32 [ 99, %[[VAL_80]] ], [ %[[VAL_84]], %[[VAL_81]] ]
-// CHECK1-NEXT: store i32 %[[VAL_85]], ptr %[[VAL_58]], align 4
-// CHECK1-NEXT: %[[VAL_86:.*]] = load i32, ptr %[[VAL_57]], align 4
-// CHECK1-NEXT: store i32 %[[VAL_86]], ptr %[[VAL_54]], align 4
-// CHECK1-NEXT: br label %[[VAL_87:.*]]
-// CHECK1: omp.inner.for.cond: ; preds = %[[VAL_88:.*]], %[[VAL_83]]
-// CHECK1-NEXT: %[[VAL_89:.*]] = load i32, ptr %[[VAL_54]], align 4
-// CHECK1-NEXT: %[[VAL_90:.*]] = load i32, ptr %[[VAL_58]], align 4
-// CHECK1-NEXT: %[[VAL_91:.*]] = icmp sle i32 %[[VAL_89]], %[[VAL_90]]
-// CHECK1-NEXT: br i1 %[[VAL_91]], label %[[VAL_92:.*]], label %[[VAL_93:.*]]
-// CHECK1: omp.inner.for.body: ; preds = %[[VAL_87]]
-// CHECK1-NEXT: %[[VAL_94:.*]] = load i32, ptr %[[VAL_57]], align 4
-// CHECK1-NEXT: %[[VAL_95:.*]] = zext i32 %[[VAL_94]] to i64
-// CHECK1-NEXT: %[[VAL_96:.*]] = load i32, ptr %[[VAL_58]], align 4
-// CHECK1-NEXT: %[[VAL_97:.*]] = zext i32 %[[VAL_96]] to i64
-// CHECK1-NEXT: %[[VAL_98:.*]] = load i32, ptr %[[VAL_61]], align 4
-// CHECK1-NEXT: store i32 %[[VAL_98]], ptr %[[VAL_64]], align 4
-// CHECK1-NEXT: %[[VAL_99:.*]] = load i64, ptr %[[VAL_64]], align 8
-// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @4, i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp_outlined, i64 %[[VAL_95]], i64 %[[VAL_97]], i64 %[[VAL_99]], ptr %[[VAL_53]])
-// CHECK1-NEXT: br label %[[VAL_88]]
-// CHECK1: omp.inner.for.inc: ; preds = %[[VAL_92]]
-// CHECK1-NEXT: %[[VAL_100:.*]] = load i32, ptr %[[VAL_54]], align 4
-// CHECK1-NEXT: %[[VAL_101:.*]] = load i32, ptr %[[VAL_59]], align 4
-// CHECK1-NEXT: %[[VAL_102:.*]] = add nsw i32 %[[VAL_100]], %[[VAL_101]]
-// CHECK1-NEXT: store i32 %[[VAL_102]], ptr %[[VAL_54]], align 4
-// CHECK1-NEXT: br label %[[VAL_87]]
-// CHECK1: omp.inner.for.end: ; preds = %[[VAL_87]]
-// CHECK1-NEXT: br label %[[VAL_103:.*]]
-// CHECK1: omp.loop.exit: ; preds = %[[VAL_93]]
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_77]])
-// CHECK1-NEXT: %[[VAL_104:.*]] = load i32, ptr %[[VAL_60]], align 4
-// CHECK1-NEXT: %[[VAL_105:.*]] = icmp ne i32 %[[VAL_104]], 0
-// CHECK1-NEXT: br i1 %[[VAL_105]], label %[[VAL_106:.*]], label %[[VAL_107:.*]]
-// CHECK1: .omp.lastprivate.then: ; preds = %[[VAL_103]]
-// CHECK1-NEXT: store i32 10, ptr %[[VAL_61]], align 4
-// CHECK1-NEXT: %[[VAL_108:.*]] = load i32, ptr %[[VAL_61]], align 4
-// CHECK1-NEXT: store i32 %[[VAL_108]], ptr %[[VAL_51]], align 4
-// CHECK1-NEXT: br label %[[VAL_107]]
-// CHECK1: .omp.lastprivate.done: ; preds = %[[VAL_106]], %[[VAL_103]]
-// CHECK1-NEXT: %[[VAL_109:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_65]], i64 0, i64 0
-// CHECK1-NEXT: store ptr %[[VAL_53]], ptr %[[VAL_109]], align 8
-// CHECK1-NEXT: %[[VAL_110:.*]] = call i32 @__kmpc_reduce(ptr @3, i32 %[[VAL_77]], i32 1, i64 8, ptr %[[VAL_65]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
-// CHECK1-NEXT: switch i32 %[[VAL_110]], label %[[VAL_111:.*]] [
-// CHECK1-NEXT: i32 1, label %[[VAL_112:.*]]
-// CHECK1-NEXT: i32 2, label %[[VAL_113:.*]]
-// CHECK1-NEXT: ]
-// CHECK1: .omp.reduction.case1: ; preds = %[[VAL_107]]
-// CHECK1-NEXT: %[[VAL_114:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_115:.*]] = load float, ptr %[[VAL_114]], align 4
-// CHECK1-NEXT: %[[VAL_116:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 1
-// CHECK1-NEXT: %[[VAL_117:.*]] = load float, ptr %[[VAL_116]], align 4
-// CHECK1-NEXT: %[[VAL_118:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_119:.*]] = load float, ptr %[[VAL_118]], align 4
-// CHECK1-NEXT: %[[VAL_120:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
-// CHECK1-NEXT: %[[VAL_121:.*]] = load float, ptr %[[VAL_120]], align 4
-// CHECK1-NEXT: %[[VAL_122:.*]] = fadd float %[[VAL_115]], %[[VAL_119]]
-// CHECK1-NEXT: %[[VAL_123:.*]] = fadd float %[[VAL_117]], %[[VAL_121]]
-// CHECK1-NEXT: %[[VAL_124:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_125:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 1
-// CHECK1-NEXT: store float %[[VAL_122]], ptr %[[VAL_124]], align 4
-// CHECK1-NEXT: store float %[[VAL_123]], ptr %[[VAL_125]], align 4
-// CHECK1-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_77]], ptr @.gomp_critical_user_.reduction.var)
-// CHECK1-NEXT: br label %[[VAL_111]]
-// CHECK1: .omp.reduction.case2: ; preds = %[[VAL_107]]
-// CHECK1-NEXT: %[[VAL_126:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_127:.*]] = load float, ptr %[[VAL_126]], align 4
-// CHECK1-NEXT: %[[VAL_128:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
-// CHECK1-NEXT: %[[VAL_129:.*]] = load float, ptr %[[VAL_128]], align 4
-// CHECK1-NEXT: call void @__atomic_load(i64 noundef 8, ptr noundef %[[VAL_73]], ptr noundef %[[VAL_66]], i32 noundef signext 0)
-// CHECK1-NEXT: br label %[[VAL_130:.*]]
-// CHECK1: atomic_cont: ; preds = %[[VAL_130]], %[[VAL_113]]
-// CHECK1-NEXT: %[[VAL_131:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_66]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_132:.*]] = load float, ptr %[[VAL_131]], align 4
-// CHECK1-NEXT: %[[VAL_133:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_66]], i32 0, i32 1
-// CHECK1-NEXT: %[[VAL_134:.*]] = load float, ptr %[[VAL_133]], align 4
-// CHECK1-NEXT: %[[VAL_135:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_136:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 1
-// CHECK1-NEXT: store float %[[VAL_132]], ptr %[[VAL_135]], align 4
-// CHECK1-NEXT: store float %[[VAL_134]], ptr %[[VAL_136]], align 4
-// CHECK1-NEXT: %[[VAL_137:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_138:.*]] = load float, ptr %[[VAL_137]], align 4
-// CHECK1-NEXT: %[[VAL_139:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 1
-// CHECK1-NEXT: %[[VAL_140:.*]] = load float, ptr %[[VAL_139]], align 4
-// CHECK1-NEXT: %[[VAL_141:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_142:.*]] = load float, ptr %[[VAL_141]], align 4
-// CHECK1-NEXT: %[[VAL_143:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
-// CHECK1-NEXT: %[[VAL_144:.*]] = load float, ptr %[[VAL_143]], align 4
-// CHECK1-NEXT: %[[VAL_145:.*]] = fadd float %[[VAL_138]], %[[VAL_142]]
-// CHECK1-NEXT: %[[VAL_146:.*]] = fadd float %[[VAL_140]], %[[VAL_144]]
-// CHECK1-NEXT: %[[VAL_147:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_67]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_148:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_67]], i32 0, i32 1
-// CHECK1-NEXT: store float %[[VAL_145]], ptr %[[VAL_147]], align 4
-// CHECK1-NEXT: store float %[[VAL_146]], ptr %[[VAL_148]], align 4
-// CHECK1-NEXT: %[[VAL_149:.*]] = call noundef zeroext i1 @__atomic_compare_exchange(i64 noundef 8, ptr noundef %[[VAL_73]], ptr noundef %[[VAL_66]], ptr noundef %[[VAL_67]], i32 noundef signext 0, i32 noundef signext 0)
-// CHECK1-NEXT: br i1 %[[VAL_149]], label %[[VAL_150:.*]], label %[[VAL_130]]
-// CHECK1: atomic_exit: ; preds = %[[VAL_130]]
-// CHECK1-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_77]], ptr @.gomp_critical_user_.reduction.var)
-// CHECK1-NEXT: br label %[[VAL_111]]
-// CHECK1: .omp.reduction.default: ; preds = %[[VAL_150]], %[[VAL_112]], %[[VAL_107]]
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp.reduction.reduction_func
-// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: %[[VAL_297:.*]] = alloca ptr, align 8
-// CHECK1-NEXT: %[[VAL_298:.*]] = alloca ptr, align 8
-// CHECK1-NEXT: store ptr %[[VAL_299:.*]], ptr %[[VAL_297]], align 8
-// CHECK1-NEXT: store ptr %[[VAL_300:.*]], ptr %[[VAL_298]], align 8
-// CHECK1-NEXT: %[[VAL_301:.*]] = load ptr, ptr %[[VAL_297]], align 8
-// CHECK1-NEXT: %[[VAL_302:.*]] = load ptr, ptr %[[VAL_298]], align 8
-// CHECK1-NEXT: %[[VAL_303:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_302]], i64 0, i64 0
-// CHECK1-NEXT: %[[VAL_304:.*]] = load ptr, ptr %[[VAL_303]], align 8
-// CHECK1-NEXT: %[[VAL_305:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_301]], i64 0, i64 0
-// CHECK1-NEXT: %[[VAL_306:.*]] = load ptr, ptr %[[VAL_305]], align 8
-// CHECK1-NEXT: %[[VAL_307:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_306]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_308:.*]] = load float, ptr %[[VAL_307]], align 4
-// CHECK1-NEXT: %[[VAL_309:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_306]], i32 0, i32 1
-// CHECK1-NEXT: %[[VAL_310:.*]] = load float, ptr %[[VAL_309]], align 4
-// CHECK1-NEXT: %[[VAL_311:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_304]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_312:.*]] = load float, ptr %[[VAL_311]], align 4
-// CHECK1-NEXT: %[[VAL_313:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_304]], i32 0, i32 1
-// CHECK1-NEXT: %[[VAL_314:.*]] = load float, ptr %[[VAL_313]], align 4
-// CHECK1-NEXT: %[[VAL_315:.*]] = fadd float %[[VAL_308]], %[[VAL_312]]
-// CHECK1-NEXT: %[[VAL_316:.*]] = fadd float %[[VAL_310]], %[[VAL_314]]
-// CHECK1-NEXT: %[[VAL_317:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_306]], i32 0, i32 0
-// CHECK1-NEXT: %[[VAL_318:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_306]], i32 0, i32 1
-// CHECK1-NEXT: store float %[[VAL_315]], ptr %[[VAL_317]], align 4
-// CHECK1-NEXT: store float %[[VAL_316]], ptr %[[VAL_318]], align 4
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@_Z3foov
-// CHECK2-SAME: () #[[ATTR0:[0-9]+]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: %[[VAL_0:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_1:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_2:.*]] = alloca { float, float }, align 4
-// CHECK2-NEXT: %[[VAL_3:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_4:.*]] = alloca [2 x ptr], align 4
-// CHECK2-NEXT: %[[VAL_5:.*]] = alloca [2 x ptr], align 4
-// CHECK2-NEXT: %[[VAL_6:.*]] = alloca [2 x ptr], align 4
-// CHECK2-NEXT: %[[VAL_7:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_8:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_9:.*]] = alloca %[[VAL_10:.*]], align 8
-// CHECK2-NEXT: %[[VAL_11:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_2]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_12:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_2]], i32 0, i32 1
-// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_11]], align 4
-// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_12]], align 4
-// CHECK2-NEXT: %[[VAL_13:.*]] = load i32, ptr %[[VAL_1]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_13]], ptr %[[VAL_3]], align 4
-// CHECK2-NEXT: %[[VAL_14:.*]] = load i32, ptr %[[VAL_3]], align 4
-// CHECK2-NEXT: %[[VAL_15:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 0
-// CHECK2-NEXT: store i32 %[[VAL_14]], ptr %[[VAL_15]], align 4
-// CHECK2-NEXT: %[[VAL_16:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 0
-// CHECK2-NEXT: store i32 %[[VAL_14]], ptr %[[VAL_16]], align 4
-// CHECK2-NEXT: %[[VAL_17:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_6]], i32 0, i32 0
-// CHECK2-NEXT: store ptr null, ptr %[[VAL_17]], align 4
-// CHECK2-NEXT: %[[VAL_18:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 1
-// CHECK2-NEXT: store ptr %[[VAL_2]], ptr %[[VAL_18]], align 4
-// CHECK2-NEXT: %[[VAL_19:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 1
-// CHECK2-NEXT: store ptr %[[VAL_2]], ptr %[[VAL_19]], align 4
-// CHECK2-NEXT: %[[VAL_20:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_6]], i32 0, i32 1
-// CHECK2-NEXT: store ptr null, ptr %[[VAL_20]], align 4
-// CHECK2-NEXT: %[[VAL_21:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_22:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_23:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 0
-// CHECK2-NEXT: store i32 2, ptr %[[VAL_23]], align 4
-// CHECK2-NEXT: %[[VAL_24:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 1
-// CHECK2-NEXT: store i32 2, ptr %[[VAL_24]], align 4
-// CHECK2-NEXT: %[[VAL_25:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 2
-// CHECK2-NEXT: store ptr %[[VAL_21]], ptr %[[VAL_25]], align 4
-// CHECK2-NEXT: %[[VAL_26:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 3
-// CHECK2-NEXT: store ptr %[[VAL_22]], ptr %[[VAL_26]], align 4
-// CHECK2-NEXT: %[[VAL_27:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 4
-// CHECK2-NEXT: store ptr @.offload_sizes, ptr %[[VAL_27]], align 4
-// CHECK2-NEXT: %[[VAL_28:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 5
-// CHECK2-NEXT: store ptr @.offload_maptypes, ptr %[[VAL_28]], align 4
-// CHECK2-NEXT: %[[VAL_29:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 6
-// CHECK2-NEXT: store ptr null, ptr %[[VAL_29]], align 4
-// CHECK2-NEXT: %[[VAL_30:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 7
-// CHECK2-NEXT: store ptr null, ptr %[[VAL_30]], align 4
-// CHECK2-NEXT: %[[VAL_31:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 8
-// CHECK2-NEXT: store i64 100, ptr %[[VAL_31]], align 8
-// CHECK2-NEXT: %[[VAL_32:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 9
-// CHECK2-NEXT: store i64 0, ptr %[[VAL_32]], align 8
-// CHECK2-NEXT: %[[VAL_33:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 10
-// CHECK2-NEXT: store [3 x i32] zeroinitializer, ptr %[[VAL_33]], align 4
-// CHECK2-NEXT: %[[VAL_34:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 11
-// CHECK2-NEXT: store [3 x i32] zeroinitializer, ptr %[[VAL_34]], align 4
-// CHECK2-NEXT: %[[VAL_35:.*]] = getelementptr inbounds %[[VAL_10]], ptr %[[VAL_9]], i32 0, i32 12
-// CHECK2-NEXT: store i32 0, ptr %[[VAL_35]], align 4
-// CHECK2-NEXT: %[[VAL_36:.*]] = call i32 @__tgt_target_kernel(ptr @4, i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.region_id, ptr %[[VAL_9]])
-// CHECK2-NEXT: %[[VAL_37:.*]] = icmp ne i32 %[[VAL_36]], 0
-// CHECK2-NEXT: br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_39:.*]]
-// CHECK2: omp_offload.failed: ; preds = %[[VAL_40:.*]]
-// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31(i32 %[[VAL_14]], ptr %[[VAL_2]]) #2
-// CHECK2-NEXT: br label %[[VAL_39]]
-// CHECK2: omp_offload.cont: ; preds = %[[VAL_38]], %[[VAL_40]]
-// CHECK2-NEXT: ret i32 0
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31
-// CHECK2-SAME: (i32 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: %[[VAL_41:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_42:.*]] = alloca ptr, align 4
-// CHECK2-NEXT: %[[VAL_43:.*]] = alloca i32, align 4
-// CHECK2-NEXT: store i32 %[[VAL_44:.*]], ptr %[[VAL_41]], align 4
-// CHECK2-NEXT: store ptr %[[VAL_45:.*]], ptr %[[VAL_42]], align 4
-// CHECK2-NEXT: %[[VAL_46:.*]] = load ptr, ptr %[[VAL_42]], align 4
-// CHECK2-NEXT: %[[VAL_47:.*]] = load i32, ptr %[[VAL_41]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_47]], ptr %[[VAL_43]], align 4
-// CHECK2-NEXT: %[[VAL_48:.*]] = load i32, ptr %[[VAL_43]], align 4
-// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @4, i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined, i32 %[[VAL_48]], ptr %[[VAL_46]])
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: %[[VAL_49:.*]] = alloca ptr, align 4
-// CHECK2-NEXT: %[[VAL_50:.*]] = alloca ptr, align 4
-// CHECK2-NEXT: %[[VAL_51:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_52:.*]] = alloca ptr, align 4
-// CHECK2-NEXT: %[[VAL_53:.*]] = alloca { float, float }, align 4
-// CHECK2-NEXT: %[[VAL_54:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_55:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_56:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_57:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_58:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_59:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_60:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_61:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_62:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_63:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_64:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_65:.*]] = alloca [1 x ptr], align 4
-// CHECK2-NEXT: %[[VAL_66:.*]] = alloca { float, float }, align 4
-// CHECK2-NEXT: %[[VAL_67:.*]] = alloca { float, float }, align 4
-// CHECK2-NEXT: %[[VAL_68:.*]] = alloca { float, float }, align 4
-// CHECK2-NEXT: store ptr %[[VAL_69:.*]], ptr %[[VAL_49]], align 4
-// CHECK2-NEXT: store ptr %[[VAL_70:.*]], ptr %[[VAL_50]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_71:.*]], ptr %[[VAL_51]], align 4
-// CHECK2-NEXT: store ptr %[[VAL_72:.*]], ptr %[[VAL_52]], align 4
-// CHECK2-NEXT: %[[VAL_73:.*]] = load ptr, ptr %[[VAL_52]], align 4
-// CHECK2-NEXT: %[[VAL_74:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_75:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
-// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_74]], align 4
-// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_75]], align 4
-// CHECK2-NEXT: store i32 0, ptr %[[VAL_57]], align 4
-// CHECK2-NEXT: store i32 99, ptr %[[VAL_58]], align 4
-// CHECK2-NEXT: store i32 1, ptr %[[VAL_59]], align 4
-// CHECK2-NEXT: store i32 0, ptr %[[VAL_60]], align 4
-// CHECK2-NEXT: %[[VAL_76:.*]] = load ptr, ptr %[[VAL_49]], align 4
-// CHECK2-NEXT: %[[VAL_77:.*]] = load i32, ptr %[[VAL_76]], align 4
-// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @1, i32 %[[VAL_77]], i32 92, ptr %[[VAL_60]], ptr %[[VAL_57]], ptr %[[VAL_58]], ptr %[[VAL_59]], i32 1, i32 1)
-// CHECK2-NEXT: %[[VAL_78:.*]] = load i32, ptr %[[VAL_58]], align 4
-// CHECK2-NEXT: %[[VAL_79:.*]] = icmp sgt i32 %[[VAL_78]], 99
-// CHECK2-NEXT: br i1 %[[VAL_79]], label %[[VAL_80:.*]], label %[[VAL_81:.*]]
-// CHECK2: cond.true: ; preds = %[[VAL_82:.*]]
-// CHECK2-NEXT: br label %[[VAL_83:.*]]
-// CHECK2: cond.false: ; preds = %[[VAL_82]]
-// CHECK2-NEXT: %[[VAL_84:.*]] = load i32, ptr %[[VAL_58]], align 4
-// CHECK2-NEXT: br label %[[VAL_83]]
-// CHECK2: cond.end: ; preds = %[[VAL_81]], %[[VAL_80]]
-// CHECK2-NEXT: %[[VAL_85:.*]] = phi i32 [ 99, %[[VAL_80]] ], [ %[[VAL_84]], %[[VAL_81]] ]
-// CHECK2-NEXT: store i32 %[[VAL_85]], ptr %[[VAL_58]], align 4
-// CHECK2-NEXT: %[[VAL_86:.*]] = load i32, ptr %[[VAL_57]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_86]], ptr %[[VAL_54]], align 4
-// CHECK2-NEXT: br label %[[VAL_87:.*]]
-// CHECK2: omp.inner.for.cond: ; preds = %[[VAL_88:.*]], %[[VAL_83]]
-// CHECK2-NEXT: %[[VAL_89:.*]] = load i32, ptr %[[VAL_54]], align 4
-// CHECK2-NEXT: %[[VAL_90:.*]] = load i32, ptr %[[VAL_58]], align 4
-// CHECK2-NEXT: %[[VAL_91:.*]] = icmp sle i32 %[[VAL_89]], %[[VAL_90]]
-// CHECK2-NEXT: br i1 %[[VAL_91]], label %[[VAL_92:.*]], label %[[VAL_93:.*]]
-// CHECK2: omp.inner.for.body: ; preds = %[[VAL_87]]
-// CHECK2-NEXT: %[[VAL_94:.*]] = load i32, ptr %[[VAL_57]], align 4
-// CHECK2-NEXT: %[[VAL_95:.*]] = load i32, ptr %[[VAL_58]], align 4
-// CHECK2-NEXT: %[[VAL_96:.*]] = load i32, ptr %[[VAL_61]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_96]], ptr %[[VAL_64]], align 4
-// CHECK2-NEXT: %[[VAL_97:.*]] = load i32, ptr %[[VAL_64]], align 4
-// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @4, i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp_outlined, i32 %[[VAL_94]], i32 %[[VAL_95]], i32 %[[VAL_97]], ptr %[[VAL_53]])
-// CHECK2-NEXT: br label %[[VAL_88]]
-// CHECK2: omp.inner.for.inc: ; preds = %[[VAL_92]]
-// CHECK2-NEXT: %[[VAL_98:.*]] = load i32, ptr %[[VAL_54]], align 4
-// CHECK2-NEXT: %[[VAL_99:.*]] = load i32, ptr %[[VAL_59]], align 4
-// CHECK2-NEXT: %[[VAL_100:.*]] = add nsw i32 %[[VAL_98]], %[[VAL_99]]
-// CHECK2-NEXT: store i32 %[[VAL_100]], ptr %[[VAL_54]], align 4
-// CHECK2-NEXT: br label %[[VAL_87]]
-// CHECK2: omp.inner.for.end: ; preds = %[[VAL_87]]
-// CHECK2-NEXT: br label %[[VAL_101:.*]]
-// CHECK2: omp.loop.exit: ; preds = %[[VAL_93]]
-// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_77]])
-// CHECK2-NEXT: %[[VAL_102:.*]] = load i32, ptr %[[VAL_60]], align 4
-// CHECK2-NEXT: %[[VAL_103:.*]] = icmp ne i32 %[[VAL_102]], 0
-// CHECK2-NEXT: br i1 %[[VAL_103]], label %[[VAL_104:.*]], label %[[VAL_105:.*]]
-// CHECK2: .omp.lastprivate.then: ; preds = %[[VAL_101]]
-// CHECK2-NEXT: store i32 10, ptr %[[VAL_61]], align 4
-// CHECK2-NEXT: %[[VAL_106:.*]] = load i32, ptr %[[VAL_61]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_106]], ptr %[[VAL_51]], align 4
-// CHECK2-NEXT: br label %[[VAL_105]]
-// CHECK2: .omp.lastprivate.done: ; preds = %[[VAL_104]], %[[VAL_101]]
-// CHECK2-NEXT: %[[VAL_107:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_65]], i32 0, i32 0
-// CHECK2-NEXT: store ptr %[[VAL_53]], ptr %[[VAL_107]], align 4
-// CHECK2-NEXT: %[[VAL_108:.*]] = call i32 @__kmpc_reduce(ptr @3, i32 %[[VAL_77]], i32 1, i32 4, ptr %[[VAL_65]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
-// CHECK2-NEXT: switch i32 %[[VAL_108]], label %[[VAL_109:.*]] [
-// CHECK2-NEXT: i32 1, label %[[VAL_110:.*]]
-// CHECK2-NEXT: i32 2, label %[[VAL_111:.*]]
-// CHECK2-NEXT: ]
-// CHECK2: .omp.reduction.case1: ; preds = %[[VAL_105]]
-// CHECK2-NEXT: %[[VAL_112:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_113:.*]] = load float, ptr %[[VAL_112]], align 4
-// CHECK2-NEXT: %[[VAL_114:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_115:.*]] = load float, ptr %[[VAL_114]], align 4
-// CHECK2-NEXT: %[[VAL_116:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_117:.*]] = load float, ptr %[[VAL_116]], align 4
-// CHECK2-NEXT: %[[VAL_118:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_119:.*]] = load float, ptr %[[VAL_118]], align 4
-// CHECK2-NEXT: %[[VAL_120:.*]] = fadd float %[[VAL_113]], %[[VAL_117]]
-// CHECK2-NEXT: %[[VAL_121:.*]] = fadd float %[[VAL_115]], %[[VAL_119]]
-// CHECK2-NEXT: %[[VAL_122:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_123:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_73]], i32 0, i32 1
-// CHECK2-NEXT: store float %[[VAL_120]], ptr %[[VAL_122]], align 4
-// CHECK2-NEXT: store float %[[VAL_121]], ptr %[[VAL_123]], align 4
-// CHECK2-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_77]], ptr @.gomp_critical_user_.reduction.var)
-// CHECK2-NEXT: br label %[[VAL_109]]
-// CHECK2: .omp.reduction.case2: ; preds = %[[VAL_105]]
-// CHECK2-NEXT: %[[VAL_124:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_125:.*]] = load float, ptr %[[VAL_124]], align 4
-// CHECK2-NEXT: %[[VAL_126:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_127:.*]] = load float, ptr %[[VAL_126]], align 4
-// CHECK2-NEXT: call void @__atomic_load(i32 noundef 8, ptr noundef %[[VAL_73]], ptr noundef %[[VAL_66]], i32 noundef 0)
-// CHECK2-NEXT: br label %[[VAL_128:.*]]
-// CHECK2: atomic_cont: ; preds = %[[VAL_128]], %[[VAL_111]]
-// CHECK2-NEXT: %[[VAL_129:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_66]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_130:.*]] = load float, ptr %[[VAL_129]], align 4
-// CHECK2-NEXT: %[[VAL_131:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_66]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_132:.*]] = load float, ptr %[[VAL_131]], align 4
-// CHECK2-NEXT: %[[VAL_133:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_134:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 1
-// CHECK2-NEXT: store float %[[VAL_130]], ptr %[[VAL_133]], align 4
-// CHECK2-NEXT: store float %[[VAL_132]], ptr %[[VAL_134]], align 4
-// CHECK2-NEXT: %[[VAL_135:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_136:.*]] = load float, ptr %[[VAL_135]], align 4
-// CHECK2-NEXT: %[[VAL_137:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_68]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_138:.*]] = load float, ptr %[[VAL_137]], align 4
-// CHECK2-NEXT: %[[VAL_139:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_140:.*]] = load float, ptr %[[VAL_139]], align 4
-// CHECK2-NEXT: %[[VAL_141:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_53]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_142:.*]] = load float, ptr %[[VAL_141]], align 4
-// CHECK2-NEXT: %[[VAL_143:.*]] = fadd float %[[VAL_136]], %[[VAL_140]]
-// CHECK2-NEXT: %[[VAL_144:.*]] = fadd float %[[VAL_138]], %[[VAL_142]]
-// CHECK2-NEXT: %[[VAL_145:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_67]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_146:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_67]], i32 0, i32 1
-// CHECK2-NEXT: store float %[[VAL_143]], ptr %[[VAL_145]], align 4
-// CHECK2-NEXT: store float %[[VAL_144]], ptr %[[VAL_146]], align 4
-// CHECK2-NEXT: %[[VAL_147:.*]] = call noundef zeroext i1 @__atomic_compare_exchange(i32 noundef 8, ptr noundef %[[VAL_73]], ptr noundef %[[VAL_66]], ptr noundef %[[VAL_67]], i32 noundef 0, i32 noundef 0)
-// CHECK2-NEXT: br i1 %[[VAL_147]], label %[[VAL_148:.*]], label %[[VAL_128]]
-// CHECK2: atomic_exit: ; preds = %[[VAL_128]]
-// CHECK2-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_77]], ptr @.gomp_critical_user_.reduction.var)
-// CHECK2-NEXT: br label %[[VAL_109]]
-// CHECK2: .omp.reduction.default: ; preds = %[[VAL_148]], %[[VAL_110]], %[[VAL_105]]
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: %[[VAL_149:.*]] = alloca ptr, align 4
-// CHECK2-NEXT: %[[VAL_150:.*]] = alloca ptr, align 4
-// CHECK2-NEXT: %[[VAL_151:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_152:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_153:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_154:.*]] = alloca ptr, align 4
-// CHECK2-NEXT: %[[VAL_155:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_156:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_157:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_158:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_159:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_160:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_161:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_162:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_163:.*]] = alloca { float, float }, align 4
-// CHECK2-NEXT: %[[VAL_164:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_165:.*]] = alloca i32, align 4
-// CHECK2-NEXT: %[[VAL_166:.*]] = alloca [1 x ptr], align 4
-// CHECK2-NEXT: %[[VAL_167:.*]] = alloca { float, float }, align 4
-// CHECK2-NEXT: %[[VAL_168:.*]] = alloca { float, float }, align 4
-// CHECK2-NEXT: %[[VAL_169:.*]] = alloca { float, float }, align 4
-// CHECK2-NEXT: store ptr %[[VAL_170:.*]], ptr %[[VAL_149]], align 4
-// CHECK2-NEXT: store ptr %[[VAL_171:.*]], ptr %[[VAL_150]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_172:.*]], ptr %[[VAL_151]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_173:.*]], ptr %[[VAL_152]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_174:.*]], ptr %[[VAL_153]], align 4
-// CHECK2-NEXT: store ptr %[[VAL_175:.*]], ptr %[[VAL_154]], align 4
-// CHECK2-NEXT: %[[VAL_176:.*]] = load ptr, ptr %[[VAL_154]], align 4
-// CHECK2-NEXT: store i32 0, ptr %[[VAL_158]], align 4
-// CHECK2-NEXT: store i32 99, ptr %[[VAL_159]], align 4
-// CHECK2-NEXT: %[[VAL_177:.*]] = load i32, ptr %[[VAL_151]], align 4
-// CHECK2-NEXT: %[[VAL_178:.*]] = load i32, ptr %[[VAL_152]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_177]], ptr %[[VAL_158]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_178]], ptr %[[VAL_159]], align 4
-// CHECK2-NEXT: store i32 1, ptr %[[VAL_160]], align 4
-// CHECK2-NEXT: store i32 0, ptr %[[VAL_161]], align 4
-// CHECK2-NEXT: %[[VAL_179:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_180:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
-// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_179]], align 4
-// CHECK2-NEXT: store float 0.000000e+00, ptr %[[VAL_180]], align 4
-// CHECK2-NEXT: %[[VAL_181:.*]] = load ptr, ptr %[[VAL_149]], align 4
-// CHECK2-NEXT: %[[VAL_182:.*]] = load i32, ptr %[[VAL_181]], align 4
-// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @2, i32 %[[VAL_182]], i32 34, ptr %[[VAL_161]], ptr %[[VAL_158]], ptr %[[VAL_159]], ptr %[[VAL_160]], i32 1, i32 1)
-// CHECK2-NEXT: %[[VAL_183:.*]] = load i32, ptr %[[VAL_159]], align 4
-// CHECK2-NEXT: %[[VAL_184:.*]] = icmp sgt i32 %[[VAL_183]], 99
-// CHECK2-NEXT: br i1 %[[VAL_184]], label %[[VAL_185:.*]], label %[[VAL_186:.*]]
-// CHECK2: cond.true: ; preds = %[[VAL_187:.*]]
-// CHECK2-NEXT: br label %[[VAL_188:.*]]
-// CHECK2: cond.false: ; preds = %[[VAL_187]]
-// CHECK2-NEXT: %[[VAL_189:.*]] = load i32, ptr %[[VAL_159]], align 4
-// CHECK2-NEXT: br label %[[VAL_188]]
-// CHECK2: cond.end: ; preds = %[[VAL_186]], %[[VAL_185]]
-// CHECK2-NEXT: %[[VAL_190:.*]] = phi i32 [ 99, %[[VAL_185]] ], [ %[[VAL_189]], %[[VAL_186]] ]
-// CHECK2-NEXT: store i32 %[[VAL_190]], ptr %[[VAL_159]], align 4
-// CHECK2-NEXT: %[[VAL_191:.*]] = load i32, ptr %[[VAL_158]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_191]], ptr %[[VAL_155]], align 4
-// CHECK2-NEXT: br label %[[VAL_192:.*]]
-// CHECK2: omp.inner.for.cond: ; preds = %[[VAL_193:.*]], %[[VAL_188]]
-// CHECK2-NEXT: %[[VAL_194:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_195:.*]] = load i32, ptr %[[VAL_159]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_196:.*]] = icmp sle i32 %[[VAL_194]], %[[VAL_195]]
-// CHECK2-NEXT: br i1 %[[VAL_196]], label %[[VAL_197:.*]], label %[[VAL_198:.*]]
-// CHECK2: omp.inner.for.body: ; preds = %[[VAL_192]]
-// CHECK2-NEXT: %[[VAL_199:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_200:.*]] = sdiv i32 %[[VAL_199]], 10
-// CHECK2-NEXT: %[[VAL_201:.*]] = mul nsw i32 %[[VAL_200]], 1
-// CHECK2-NEXT: %[[VAL_202:.*]] = add nsw i32 0, %[[VAL_201]]
-// CHECK2-NEXT: store i32 %[[VAL_202]], ptr %[[VAL_164]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_203:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_204:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_205:.*]] = sdiv i32 %[[VAL_204]], 10
-// CHECK2-NEXT: %[[VAL_206:.*]] = mul nsw i32 %[[VAL_205]], 10
-// CHECK2-NEXT: %[[VAL_207:.*]] = sub nsw i32 %[[VAL_203]], %[[VAL_206]]
-// CHECK2-NEXT: %[[VAL_208:.*]] = mul nsw i32 %[[VAL_207]], 1
-// CHECK2-NEXT: %[[VAL_209:.*]] = add nsw i32 0, %[[VAL_208]]
-// CHECK2-NEXT: store i32 %[[VAL_209]], ptr %[[VAL_162]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_210:.*]] = load i32, ptr %[[VAL_164]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_211:.*]] = sitofp i32 %[[VAL_210]] to float
-// CHECK2-NEXT: %[[VAL_212:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_213:.*]] = load float, ptr %[[VAL_212]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_214:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_215:.*]] = load float, ptr %[[VAL_214]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_216:.*]] = fadd float %[[VAL_213]], %[[VAL_211]]
-// CHECK2-NEXT: %[[VAL_217:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_218:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
-// CHECK2-NEXT: store float %[[VAL_216]], ptr %[[VAL_217]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: store float %[[VAL_215]], ptr %[[VAL_218]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: br label %[[VAL_219:.*]]
-// CHECK2: omp.body.continue: ; preds = %[[VAL_197]]
-// CHECK2-NEXT: br label %[[VAL_193]]
-// CHECK2: omp.inner.for.inc: ; preds = %[[VAL_219]]
-// CHECK2-NEXT: %[[VAL_220:.*]] = load i32, ptr %[[VAL_155]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: %[[VAL_221:.*]] = add nsw i32 %[[VAL_220]], 1
-// CHECK2-NEXT: store i32 %[[VAL_221]], ptr %[[VAL_155]], align 4, !llvm.access.group !5
-// CHECK2-NEXT: br label %[[VAL_192]], !llvm.loop !6
-// CHECK2: omp.inner.for.end: ; preds = %[[VAL_192]]
-// CHECK2-NEXT: br label %[[VAL_222:.*]]
-// CHECK2: omp.loop.exit: ; preds = %[[VAL_198]]
-// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @2, i32 %[[VAL_182]])
-// CHECK2-NEXT: %[[VAL_223:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_166]], i32 0, i32 0
-// CHECK2-NEXT: store ptr %[[VAL_163]], ptr %[[VAL_223]], align 4
-// CHECK2-NEXT: %[[VAL_224:.*]] = call i32 @__kmpc_reduce(ptr @3, i32 %[[VAL_182]], i32 1, i32 4, ptr %[[VAL_166]], ptr @__omp_offloading_1030b_4868a89__Z3foov_l31.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
-// CHECK2-NEXT: switch i32 %[[VAL_224]], label %[[VAL_225:.*]] [
-// CHECK2-NEXT: i32 1, label %[[VAL_226:.*]]
-// CHECK2-NEXT: i32 2, label %[[VAL_227:.*]]
-// CHECK2-NEXT: ]
-// CHECK2: .omp.reduction.case1: ; preds = %[[VAL_222]]
-// CHECK2-NEXT: %[[VAL_228:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_229:.*]] = load float, ptr %[[VAL_228]], align 4
-// CHECK2-NEXT: %[[VAL_230:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_231:.*]] = load float, ptr %[[VAL_230]], align 4
-// CHECK2-NEXT: %[[VAL_232:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_233:.*]] = load float, ptr %[[VAL_232]], align 4
-// CHECK2-NEXT: %[[VAL_234:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_235:.*]] = load float, ptr %[[VAL_234]], align 4
-// CHECK2-NEXT: %[[VAL_236:.*]] = fadd float %[[VAL_229]], %[[VAL_233]]
-// CHECK2-NEXT: %[[VAL_237:.*]] = fadd float %[[VAL_231]], %[[VAL_235]]
-// CHECK2-NEXT: %[[VAL_238:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_239:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_176]], i32 0, i32 1
-// CHECK2-NEXT: store float %[[VAL_236]], ptr %[[VAL_238]], align 4
-// CHECK2-NEXT: store float %[[VAL_237]], ptr %[[VAL_239]], align 4
-// CHECK2-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_182]], ptr @.gomp_critical_user_.reduction.var)
-// CHECK2-NEXT: br label %[[VAL_225]]
-// CHECK2: .omp.reduction.case2: ; preds = %[[VAL_222]]
-// CHECK2-NEXT: %[[VAL_240:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_241:.*]] = load float, ptr %[[VAL_240]], align 4
-// CHECK2-NEXT: %[[VAL_242:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_243:.*]] = load float, ptr %[[VAL_242]], align 4
-// CHECK2-NEXT: call void @__atomic_load(i32 noundef 8, ptr noundef %[[VAL_176]], ptr noundef %[[VAL_167]], i32 noundef 0)
-// CHECK2-NEXT: br label %[[VAL_244:.*]]
-// CHECK2: atomic_cont: ; preds = %[[VAL_244]], %[[VAL_227]]
-// CHECK2-NEXT: %[[VAL_245:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_167]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_246:.*]] = load float, ptr %[[VAL_245]], align 4
-// CHECK2-NEXT: %[[VAL_247:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_167]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_248:.*]] = load float, ptr %[[VAL_247]], align 4
-// CHECK2-NEXT: %[[VAL_249:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_250:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 1
-// CHECK2-NEXT: store float %[[VAL_246]], ptr %[[VAL_249]], align 4
-// CHECK2-NEXT: store float %[[VAL_248]], ptr %[[VAL_250]], align 4
-// CHECK2-NEXT: %[[VAL_251:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_252:.*]] = load float, ptr %[[VAL_251]], align 4
-// CHECK2-NEXT: %[[VAL_253:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_169]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_254:.*]] = load float, ptr %[[VAL_253]], align 4
-// CHECK2-NEXT: %[[VAL_255:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_256:.*]] = load float, ptr %[[VAL_255]], align 4
-// CHECK2-NEXT: %[[VAL_257:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_163]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_258:.*]] = load float, ptr %[[VAL_257]], align 4
-// CHECK2-NEXT: %[[VAL_259:.*]] = fadd float %[[VAL_252]], %[[VAL_256]]
-// CHECK2-NEXT: %[[VAL_260:.*]] = fadd float %[[VAL_254]], %[[VAL_258]]
-// CHECK2-NEXT: %[[VAL_261:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_168]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_262:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_168]], i32 0, i32 1
-// CHECK2-NEXT: store float %[[VAL_259]], ptr %[[VAL_261]], align 4
-// CHECK2-NEXT: store float %[[VAL_260]], ptr %[[VAL_262]], align 4
-// CHECK2-NEXT: %[[VAL_263:.*]] = call noundef zeroext i1 @__atomic_compare_exchange(i32 noundef 8, ptr noundef %[[VAL_176]], ptr noundef %[[VAL_167]], ptr noundef %[[VAL_168]], i32 noundef 0, i32 noundef 0)
-// CHECK2-NEXT: br i1 %[[VAL_263]], label %[[VAL_264:.*]], label %[[VAL_244]]
-// CHECK2: atomic_exit: ; preds = %[[VAL_244]]
-// CHECK2-NEXT: call void @__kmpc_end_reduce(ptr @3, i32 %[[VAL_182]], ptr @.gomp_critical_user_.reduction.var)
-// CHECK2-NEXT: br label %[[VAL_225]]
-// CHECK2: .omp.reduction.default: ; preds = %[[VAL_264]], %[[VAL_226]], %[[VAL_222]]
-// CHECK2-NEXT: %[[VAL_265:.*]] = load i32, ptr %[[VAL_161]], align 4
-// CHECK2-NEXT: %[[VAL_266:.*]] = icmp ne i32 %[[VAL_265]], 0
-// CHECK2-NEXT: br i1 %[[VAL_266]], label %[[VAL_267:.*]], label %[[VAL_268:.*]]
-// CHECK2: .omp.lastprivate.then: ; preds = %[[VAL_225]]
-// CHECK2-NEXT: store i32 10, ptr %[[VAL_162]], align 4
-// CHECK2-NEXT: %[[VAL_269:.*]] = load i32, ptr %[[VAL_162]], align 4
-// CHECK2-NEXT: store i32 %[[VAL_269]], ptr %[[VAL_153]], align 4
-// CHECK2-NEXT: br label %[[VAL_268]]
-// CHECK2: .omp.lastprivate.done: ; preds = %[[VAL_267]], %[[VAL_225]]
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp_outlined.omp.reduction.reduction_func
-// CHECK2-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: %[[VAL_271:.*]] = alloca ptr, align 4
-// CHECK2-NEXT: %[[VAL_272:.*]] = alloca ptr, align 4
-// CHECK2-NEXT: store ptr %[[VAL_273:.*]], ptr %[[VAL_271]], align 4
-// CHECK2-NEXT: store ptr %[[VAL_274:.*]], ptr %[[VAL_272]], align 4
-// CHECK2-NEXT: %[[VAL_275:.*]] = load ptr, ptr %[[VAL_271]], align 4
-// CHECK2-NEXT: %[[VAL_276:.*]] = load ptr, ptr %[[VAL_272]], align 4
-// CHECK2-NEXT: %[[VAL_277:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_276]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_278:.*]] = load ptr, ptr %[[VAL_277]], align 4
-// CHECK2-NEXT: %[[VAL_279:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_275]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_280:.*]] = load ptr, ptr %[[VAL_279]], align 4
-// CHECK2-NEXT: %[[VAL_281:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_280]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_282:.*]] = load float, ptr %[[VAL_281]], align 4
-// CHECK2-NEXT: %[[VAL_283:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_280]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_284:.*]] = load float, ptr %[[VAL_283]], align 4
-// CHECK2-NEXT: %[[VAL_285:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_278]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_286:.*]] = load float, ptr %[[VAL_285]], align 4
-// CHECK2-NEXT: %[[VAL_287:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_278]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_288:.*]] = load float, ptr %[[VAL_287]], align 4
-// CHECK2-NEXT: %[[VAL_289:.*]] = fadd float %[[VAL_282]], %[[VAL_286]]
-// CHECK2-NEXT: %[[VAL_290:.*]] = fadd float %[[VAL_284]], %[[VAL_288]]
-// CHECK2-NEXT: %[[VAL_291:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_280]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_292:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_280]], i32 0, i32 1
-// CHECK2-NEXT: store float %[[VAL_289]], ptr %[[VAL_291]], align 4
-// CHECK2-NEXT: store float %[[VAL_290]], ptr %[[VAL_292]], align 4
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l31.omp_outlined.omp.reduction.reduction_func
-// CHECK2-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: %[[VAL_293:.*]] = alloca ptr, align 4
-// CHECK2-NEXT: %[[VAL_294:.*]] = alloca ptr, align 4
-// CHECK2-NEXT: store ptr %[[VAL_295:.*]], ptr %[[VAL_293]], align 4
-// CHECK2-NEXT: store ptr %[[VAL_296:.*]], ptr %[[VAL_294]], align 4
-// CHECK2-NEXT: %[[VAL_297:.*]] = load ptr, ptr %[[VAL_293]], align 4
-// CHECK2-NEXT: %[[VAL_298:.*]] = load ptr, ptr %[[VAL_294]], align 4
-// CHECK2-NEXT: %[[VAL_299:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_298]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_300:.*]] = load ptr, ptr %[[VAL_299]], align 4
-// CHECK2-NEXT: %[[VAL_301:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_297]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_302:.*]] = load ptr, ptr %[[VAL_301]], align 4
-// CHECK2-NEXT: %[[VAL_303:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_302]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_304:.*]] = load float, ptr %[[VAL_303]], align 4
-// CHECK2-NEXT: %[[VAL_305:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_302]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_306:.*]] = load float, ptr %[[VAL_305]], align 4
-// CHECK2-NEXT: %[[VAL_307:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_300]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_308:.*]] = load float, ptr %[[VAL_307]], align 4
-// CHECK2-NEXT: %[[VAL_309:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_300]], i32 0, i32 1
-// CHECK2-NEXT: %[[VAL_310:.*]] = load float, ptr %[[VAL_309]], align 4
-// CHECK2-NEXT: %[[VAL_311:.*]] = fadd float %[[VAL_304]], %[[VAL_308]]
-// CHECK2-NEXT: %[[VAL_312:.*]] = fadd float %[[VAL_306]], %[[VAL_310]]
-// CHECK2-NEXT: %[[VAL_313:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_302]], i32 0, i32 0
-// CHECK2-NEXT: %[[VAL_314:.*]] = getelementptr inbounds { float, float }, ptr %[[VAL_302]], i32 0, i32 1
-// CHECK2-NEXT: store float %[[VAL_311]], ptr %[[VAL_313]], align 4
-// CHECK2-NEXT: store float %[[VAL_312]], ptr %[[VAL_314]], align 4
-// CHECK2-NEXT: ret void
\ No newline at end of file
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index ec5ddb117d5828..6c29f6555f4bf5 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -628,8 +628,8 @@ class OpenMPIRBuilder {
/// \param ThreadID Optional parameter to pass in any existing ThreadID value.
///
/// \returns The insertion point after the barrier.
- InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive DK,
- bool ForceSimpleCall = false,
+ InsertPointTy createBarrier(const LocationDescription &Loc,
+ omp::Directive Kind, bool ForceSimpleCall = false,
bool CheckCancelFlag = true,
Value *ThreadID = nullptr);
@@ -1252,7 +1252,7 @@ class OpenMPIRBuilder {
StringRef ParentName = "");
/// Enum class for the ReductionGen CallBack type to be used.
- enum class ReductionGenCBTy { Clang, MLIR };
+ enum class ReductionGenCBKind { Clang, MLIR };
/// ReductionGen CallBack for Clang
///
@@ -1264,7 +1264,7 @@ class OpenMPIRBuilder {
/// return the RHSPtr it used for codegen, used for fixup later.
/// \param CurFn Optionally used by Clang to pass in the Current Function as
/// Clang context may be old.
- using ReductionGenCBClang =
+ using ReductionGenClangCBTy =
std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
Value **LHS, Value **RHS, Function *CurFn)>;
@@ -1273,25 +1273,25 @@ class OpenMPIRBuilder {
/// \param CodeGenIP InsertPoint for CodeGen.
/// \param LHS Pass in the LHS Value to be used for CodeGen.
/// \param RHS Pass in the RHS Value to be used for CodeGen.
- using ReductionGenCB = std::function<InsertPointTy(
+ using ReductionGenCBTy = std::function<InsertPointTy(
InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
/// Functions used to generate atomic reductions. Such functions take two
/// Values representing pointers to LHS and RHS of the reduction, as well as
/// the element type of these pointers. They are expected to atomically
/// update the LHS to the reduced value.
- using AtomicReductionGenCB =
+ using ReductionGenAtomicCBTy =
std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
/// Enum class for reduction evaluation types scalar, complex and aggregate.
- enum class EvaluationKindTy { Scalar, Complex, Aggregate };
+ enum class EvaluationKind { Scalar, Complex, Aggregate };
/// Information about an OpenMP reduction.
struct ReductionInfo {
ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
- EvaluationKindTy EvaluationKind, ReductionGenCB ReductionGen,
- ReductionGenCBClang ReductionGenClang,
- AtomicReductionGenCB AtomicReductionGen)
+ EvaluationKind EvaluationKind, ReductionGenCBTy ReductionGen,
+ ReductionGenClangCBTy ReductionGenClang,
+ ReductionGenAtomicCBTy AtomicReductionGen)
: ElementType(ElementType), Variable(Variable),
PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
@@ -1299,7 +1299,7 @@ class OpenMPIRBuilder {
ReductionInfo(Value *PrivateVariable)
: ElementType(nullptr), Variable(nullptr),
PrivateVariable(PrivateVariable),
- EvaluationKind(EvaluationKindTy::Scalar), ReductionGen(),
+ EvaluationKind(EvaluationKind::Scalar), ReductionGen(),
ReductionGenClang(), AtomicReductionGen() {}
/// Reduction element type, must match pointee type of variable.
@@ -1312,23 +1312,23 @@ class OpenMPIRBuilder {
Value *PrivateVariable;
/// Reduction evaluation type - scalar, complex or aggregate.
- EvaluationKindTy EvaluationKind;
+ EvaluationKind EvaluationKind;
/// Callback for generating the reduction body. The IR produced by this will
/// be used to combine two values in a thread-safe context, e.g., under
/// lock or within the same thread, and therefore need not be atomic.
- ReductionGenCB ReductionGen;
+ ReductionGenCBTy ReductionGen;
/// Clang callback for generating the reduction body. The IR produced by
/// this will be used to combine two values in a thread-safe context, e.g.,
/// under lock or within the same thread, and therefore need not be atomic.
- ReductionGenCBClang ReductionGenClang;
+ ReductionGenClangCBTy ReductionGenClang;
/// Callback for generating the atomic reduction body, may be null. The IR
/// produced by this will be used to atomically combine two values during
/// reduction. If null, the implementation will use the non-atomic version
/// along with the appropriate synchronization mechanisms.
- AtomicReductionGenCB AtomicReductionGen;
+ ReductionGenAtomicCBTy AtomicReductionGen;
};
enum class CopyAction : unsigned {
@@ -1574,7 +1574,7 @@ class OpenMPIRBuilder {
/// \return The reduction function.
Function *createReductionFunction(
StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
- ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
+ ReductionGenCBKind ReductionGenCBTy = ReductionGenCBKind::MLIR,
AttributeList FuncAttrs = {});
public:
@@ -1843,7 +1843,7 @@ class OpenMPIRBuilder {
InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
bool IsNoWait = false, bool IsTeamsReduction = false,
bool HasDistribute = false,
- ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
+ ReductionGenCBKind ReductionGenCBTy = ReductionGenCBKind::MLIR,
std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
Value *SrcLocInfo = nullptr);
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index cf49d7039c0e96..5a23468da6b45d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2348,13 +2348,13 @@ void OpenMPIRBuilder::emitReductionListCopy(
RemoteLaneOffset, ReductionArrayTy);
} else {
switch (RI.EvaluationKind) {
- case EvaluationKindTy::Scalar: {
+ case EvaluationKind::Scalar: {
Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
// Store the source element value to the dest element address.
Builder.CreateStore(Elem, DestElementAddr);
break;
}
- case EvaluationKindTy::Complex: {
+ case EvaluationKind::Complex: {
Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
RI.ElementType, SrcElementAddr, 0, 0, ".realp");
Value *SrcReal = Builder.CreateLoad(
@@ -2372,7 +2372,7 @@ void OpenMPIRBuilder::emitReductionListCopy(
Builder.CreateStore(SrcImg, DestImgPtr);
break;
}
- case EvaluationKindTy::Aggregate: {
+ case EvaluationKind::Aggregate: {
Value *SizeVal = Builder.getInt64(
M.getDataLayout().getTypeStoreSize(RI.ElementType));
Builder.CreateMemCpy(
@@ -2869,12 +2869,12 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
switch (RI.EvaluationKind) {
- case EvaluationKindTy::Scalar: {
+ case EvaluationKind::Scalar: {
Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
Builder.CreateStore(TargetElement, GlobVal);
break;
}
- case EvaluationKindTy::Complex: {
+ case EvaluationKind::Complex: {
Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
RI.ElementType, ElemPtr, 0, 0, ".realp");
Value *SrcReal = Builder.CreateLoad(
@@ -2892,7 +2892,7 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
Builder.CreateStore(SrcImg, DestImgPtr);
break;
}
- case EvaluationKindTy::Aggregate: {
+ case EvaluationKind::Aggregate: {
Value *SizeVal =
Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
Builder.CreateMemCpy(
@@ -3059,12 +3059,12 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
switch (RI.EvaluationKind) {
- case EvaluationKindTy::Scalar: {
+ case EvaluationKind::Scalar: {
Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
Builder.CreateStore(TargetElement, ElemPtr);
break;
}
- case EvaluationKindTy::Complex: {
+ case EvaluationKind::Complex: {
Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
RI.ElementType, GlobValPtr, 0, 0, ".realp");
Value *SrcReal = Builder.CreateLoad(
@@ -3082,7 +3082,7 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
Builder.CreateStore(SrcImg, DestImgPtr);
break;
}
- case EvaluationKindTy::Aggregate: {
+ case EvaluationKind::Aggregate: {
Value *SizeVal =
Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
Builder.CreateMemCpy(
@@ -3190,7 +3190,7 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
Function *OpenMPIRBuilder::createReductionFunction(
StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
- ReductionGenCBTy ReductionGenCBTy, AttributeList FuncAttrs) {
+ ReductionGenCBKind ReductionGenCBTy, AttributeList FuncAttrs) {
auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
{Builder.getPtrTy(), Builder.getPtrTy()},
/* IsVarArg */ false);
@@ -3247,7 +3247,7 @@ Function *OpenMPIRBuilder::createReductionFunction(
Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
- if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
+ if (ReductionGenCBTy == ReductionGenCBKind::Clang) {
LHSPtrs.emplace_back(LHSPtr);
RHSPtrs.emplace_back(RHSPtr);
} else {
@@ -3261,7 +3261,7 @@ Function *OpenMPIRBuilder::createReductionFunction(
}
}
- if (ReductionGenCBTy == ReductionGenCBTy::Clang)
+ if (ReductionGenCBTy == ReductionGenCBKind::Clang)
for (auto En : enumerate(ReductionInfos)) {
unsigned Index = En.index();
const ReductionInfo &RI = En.value();
@@ -3311,7 +3311,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
const LocationDescription &Loc, InsertPointTy AllocaIP,
InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
- ReductionGenCBTy ReductionGenCBTy, std::optional<omp::GV> GridValue,
+ ReductionGenCBKind ReductionGenCBTy, std::optional<omp::GV> GridValue,
unsigned ReductionBufNum, Value *SrcLocInfo) {
if (!updateToLocation(Loc))
return InsertPointTy();
@@ -3470,7 +3470,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
Value *RHS =
Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
- if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
+ if (ReductionGenCBTy == ReductionGenCBKind::Clang) {
Value *LHSPtr, *RHSPtr;
Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
&LHSPtr, &RHSPtr, CurFunc));
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index de4b6d6c226a81..ff7bb8a06954ac 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -4981,11 +4981,11 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) {
OpenMPIRBuilder::ReductionInfo ReductionInfos[] = {
{SumType, SumReduced, SumPrivatized,
- /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
- sumReduction, /*ReductionGenClang=*/nullptr, sumAtomicReduction},
+ /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKind::Scalar, sumReduction,
+ /*ReductionGenClang=*/nullptr, sumAtomicReduction},
{XorType, XorReduced, XorPrivatized,
- /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
- xorReduction, /*ReductionGenClang=*/nullptr, xorAtomicReduction}};
+ /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKind::Scalar, xorReduction,
+ /*ReductionGenClang=*/nullptr, xorAtomicReduction}};
OMPBuilder.Config.setIsGPU(false);
OMPBuilder.createReductions(BodyIP, BodyAllocaIP, ReductionInfos);
@@ -5238,12 +5238,12 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) {
OMPBuilder.createReductions(
FirstBodyIP, FirstBodyAllocaIP,
{{SumType, SumReduced, SumPrivatized,
- /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
+ /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKind::Scalar,
sumReduction, /*ReductionGenClang=*/nullptr, sumAtomicReduction}});
OMPBuilder.createReductions(
SecondBodyIP, SecondBodyAllocaIP,
{{XorType, XorReduced, XorPrivatized,
- /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
+ /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKind::Scalar,
xorReduction, /*ReductionGenClang=*/nullptr, xorAtomicReduction}});
Builder.restoreIP(AfterIP);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index e9c231bf499b11..1236a812ad1d0d 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -847,7 +847,7 @@ static void collectReductionInfo(
// Collect the reduction information.
reductionInfos.reserve(numReductions);
for (unsigned i = 0; i < numReductions; ++i) {
- llvm::OpenMPIRBuilder::AtomicReductionGenCB atomicGen = nullptr;
+ llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy atomicGen = nullptr;
if (owningAtomicReductionGens[i])
atomicGen = owningAtomicReductionGens[i];
llvm::Value *variable =
@@ -855,7 +855,7 @@ static void collectReductionInfo(
reductionInfos.push_back(
{moduleTranslation.convertType(reductionDecls[i].getType()), variable,
privateReductionVariables[i],
- /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar,
+ /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvaluationKind::Scalar,
owningReductionGens[i],
/*ReductionGenClang=*/nullptr, atomicGen});
}
>From e507ae1f2cd32821e68bb9d74dd343fbc26496a7 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Thu, 28 Mar 2024 14:40:56 +0000
Subject: [PATCH 17/18] Fix Clang CI build error.
---
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 8 ++++----
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 11 +++++------
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 18 +++++++++---------
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 2 +-
4 files changed, 19 insertions(+), 20 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 92b8c60d1ebc1a..5a1cac08dcc836 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1693,16 +1693,16 @@ void CGOpenMPRuntimeGPU::emitReduction(
const auto *LHSVar =
cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
Variable = CGF.GetAddrOfLocalVar(LHSVar).getPointer();
- llvm::OpenMPIRBuilder::EvaluationKind EvalKind;
+ llvm::OpenMPIRBuilder::EvalKind EvalKind;
switch (CGF.getEvaluationKind(Private->getType())) {
case TEK_Scalar:
- EvalKind = llvm::OpenMPIRBuilder::EvaluationKind::Scalar;
+ EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
break;
case TEK_Complex:
- EvalKind = llvm::OpenMPIRBuilder::EvaluationKind::Complex;
+ EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
break;
case TEK_Aggregate:
- EvalKind = llvm::OpenMPIRBuilder::EvaluationKind::Aggregate;
+ EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
break;
}
auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 6c29f6555f4bf5..458d6e870b2c3d 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1284,12 +1284,12 @@ class OpenMPIRBuilder {
std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
/// Enum class for reduction evaluation types scalar, complex and aggregate.
- enum class EvaluationKind { Scalar, Complex, Aggregate };
+ enum class EvalKind { Scalar, Complex, Aggregate };
/// Information about an OpenMP reduction.
struct ReductionInfo {
ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
- EvaluationKind EvaluationKind, ReductionGenCBTy ReductionGen,
+ EvalKind EvaluationKind, ReductionGenCBTy ReductionGen,
ReductionGenClangCBTy ReductionGenClang,
ReductionGenAtomicCBTy AtomicReductionGen)
: ElementType(ElementType), Variable(Variable),
@@ -1298,9 +1298,8 @@ class OpenMPIRBuilder {
AtomicReductionGen(AtomicReductionGen) {}
ReductionInfo(Value *PrivateVariable)
: ElementType(nullptr), Variable(nullptr),
- PrivateVariable(PrivateVariable),
- EvaluationKind(EvaluationKind::Scalar), ReductionGen(),
- ReductionGenClang(), AtomicReductionGen() {}
+ PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar),
+ ReductionGen(), ReductionGenClang(), AtomicReductionGen() {}
/// Reduction element type, must match pointee type of variable.
Type *ElementType;
@@ -1312,7 +1311,7 @@ class OpenMPIRBuilder {
Value *PrivateVariable;
/// Reduction evaluation type - scalar, complex or aggregate.
- EvaluationKind EvaluationKind;
+ EvalKind EvaluationKind;
/// Callback for generating the reduction body. The IR produced by this will
/// be used to combine two values in a thread-safe context, e.g., under
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 5a23468da6b45d..860957ab947bdd 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2348,13 +2348,13 @@ void OpenMPIRBuilder::emitReductionListCopy(
RemoteLaneOffset, ReductionArrayTy);
} else {
switch (RI.EvaluationKind) {
- case EvaluationKind::Scalar: {
+ case EvalKind::Scalar: {
Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
// Store the source element value to the dest element address.
Builder.CreateStore(Elem, DestElementAddr);
break;
}
- case EvaluationKind::Complex: {
+ case EvalKind::Complex: {
Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
RI.ElementType, SrcElementAddr, 0, 0, ".realp");
Value *SrcReal = Builder.CreateLoad(
@@ -2372,7 +2372,7 @@ void OpenMPIRBuilder::emitReductionListCopy(
Builder.CreateStore(SrcImg, DestImgPtr);
break;
}
- case EvaluationKind::Aggregate: {
+ case EvalKind::Aggregate: {
Value *SizeVal = Builder.getInt64(
M.getDataLayout().getTypeStoreSize(RI.ElementType));
Builder.CreateMemCpy(
@@ -2869,12 +2869,12 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
switch (RI.EvaluationKind) {
- case EvaluationKind::Scalar: {
+ case EvalKind::Scalar: {
Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
Builder.CreateStore(TargetElement, GlobVal);
break;
}
- case EvaluationKind::Complex: {
+ case EvalKind::Complex: {
Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
RI.ElementType, ElemPtr, 0, 0, ".realp");
Value *SrcReal = Builder.CreateLoad(
@@ -2892,7 +2892,7 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
Builder.CreateStore(SrcImg, DestImgPtr);
break;
}
- case EvaluationKind::Aggregate: {
+ case EvalKind::Aggregate: {
Value *SizeVal =
Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
Builder.CreateMemCpy(
@@ -3059,12 +3059,12 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
switch (RI.EvaluationKind) {
- case EvaluationKind::Scalar: {
+ case EvalKind::Scalar: {
Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
Builder.CreateStore(TargetElement, ElemPtr);
break;
}
- case EvaluationKind::Complex: {
+ case EvalKind::Complex: {
Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
RI.ElementType, GlobValPtr, 0, 0, ".realp");
Value *SrcReal = Builder.CreateLoad(
@@ -3082,7 +3082,7 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
Builder.CreateStore(SrcImg, DestImgPtr);
break;
}
- case EvaluationKind::Aggregate: {
+ case EvalKind::Aggregate: {
Value *SizeVal =
Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
Builder.CreateMemCpy(
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 1236a812ad1d0d..59cd98e8b969c6 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -855,7 +855,7 @@ static void collectReductionInfo(
reductionInfos.push_back(
{moduleTranslation.convertType(reductionDecls[i].getType()), variable,
privateReductionVariables[i],
- /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvaluationKind::Scalar,
+ /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar,
owningReductionGens[i],
/*ReductionGenClang=*/nullptr, atomicGen});
}
>From 5534943dc5d4e0cd40bf61eaf5a063090b466167 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Tue, 2 Apr 2024 17:02:06 +0100
Subject: [PATCH 18/18] Rebased.
---
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 5a1cac08dcc836..4fd48e957cdb67 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1689,10 +1689,10 @@ void CGOpenMPRuntimeGPU::emitReduction(
ElementType = CGF.ConvertTypeForMem(Private->getType());
const auto *RHSVar =
cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
- PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).getPointer();
+ PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF);
const auto *LHSVar =
cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
- Variable = CGF.GetAddrOfLocalVar(LHSVar).getPointer();
+ Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF);
llvm::OpenMPIRBuilder::EvalKind EvalKind;
switch (CGF.getEvaluationKind(Private->getType())) {
case TEK_Scalar:
@@ -1714,10 +1714,10 @@ void CGOpenMPRuntimeGPU::emitReduction(
*LHSPtr = CGF.GetAddrOfLocalVar(
cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
- .getPointer();
+ .emitRawPointer(CGF);
*RHSPtr = CGF.GetAddrOfLocalVar(
cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
- .getPointer();
+ .emitRawPointer(CGF);
emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
cast<DeclRefExpr>(LHSExprs[I]),
More information about the cfe-commits
mailing list