[clang] f9a89e6 - [OpenMP][FIX] Allocate per launch memory for GPU team reductions (#70752)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Nov 1 11:11:53 PDT 2023
Author: Johannes Doerfert
Date: 2023-11-01T11:11:48-07:00
New Revision: f9a89e6b9c4345df978bf7cbfedfd2b250029278
URL: https://github.com/llvm/llvm-project/commit/f9a89e6b9c4345df978bf7cbfedfd2b250029278
DIFF: https://github.com/llvm/llvm-project/commit/f9a89e6b9c4345df978bf7cbfedfd2b250029278.diff
LOG: [OpenMP][FIX] Allocate per launch memory for GPU team reductions (#70752)
We used to perform team reduction on global memory allocated in the
runtime and by clang. This was racy as multiple instances of a kernel,
or different kernels with team reductions, would use the same locations.
Since we now have the kernel launch environment, we can allocate dynamic
memory per-launch, allowing us to move all the state into a non-racy
place.
Fixes: https://github.com/llvm/llvm-project/issues/70249
Added:
openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
Modified:
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
openmp/libomptarget/DeviceRTL/include/Interface.h
openmp/libomptarget/DeviceRTL/src/Reduction.cpp
openmp/libomptarget/include/Environment.h
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index bd9329b8e2d4113..0ed665e0dfb9722 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -803,8 +803,30 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
if (!IsSPMD)
emitGenericVarsEpilog(CGF);
+ // This is temporary until we remove the fixed sized buffer.
+ ASTContext &C = CGM.getContext();
+ RecordDecl *StaticRD = C.buildImplicitRecord(
+ "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
+ StaticRD->startDefinition();
+ for (const RecordDecl *TeamReductionRec : TeamsReductions) {
+ QualType RecTy = C.getRecordType(TeamReductionRec);
+ auto *Field = FieldDecl::Create(
+ C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
+ C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
+ /*BW=*/nullptr, /*Mutable=*/false,
+ /*InitStyle=*/ICIS_NoInit);
+ Field->setAccess(AS_public);
+ StaticRD->addDecl(Field);
+ }
+ StaticRD->completeDefinition();
+ QualType StaticTy = C.getRecordType(StaticRD);
+ llvm::Type *LLVMReductionsBufferTy =
+ CGM.getTypes().ConvertTypeForMem(StaticTy);
+ const auto &DL = CGM.getModule().getDataLayout();
+ uint64_t BufferSize =
+ DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
CGBuilderTy &Bld = CGF.Builder;
- OMPBuilder.createTargetDeinit(Bld);
+ OMPBuilder.createTargetDeinit(Bld, BufferSize);
}
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@@ -2998,15 +3020,10 @@ void CGOpenMPRuntimeGPU::emitReduction(
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
C.getLangOpts().OpenMPCUDAReductionBufNum);
TeamsReductions.push_back(TeamReductionRec);
- if (!KernelTeamsReductionPtr) {
- KernelTeamsReductionPtr = new llvm::GlobalVariable(
- CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
- llvm::GlobalValue::InternalLinkage, nullptr,
- "_openmp_teams_reductions_buffer_$_$ptr");
- }
- llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
- Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
- /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
+ auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
+ {}, "_openmp_teams_reductions_buffer_$_$ptr");
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
@@ -3021,7 +3038,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
llvm::Value *Args[] = {
RTLoc,
ThreadId,
- GlobalBufferPtr,
+ KernelTeamsReductionPtr,
CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
RL,
ShuffleAndReduceFn,
@@ -3654,42 +3671,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
CGOpenMPRuntime::processRequiresDirective(D);
}
-void CGOpenMPRuntimeGPU::clear() {
-
- if (!TeamsReductions.empty()) {
- ASTContext &C = CGM.getContext();
- RecordDecl *StaticRD = C.buildImplicitRecord(
- "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
- StaticRD->startDefinition();
- for (const RecordDecl *TeamReductionRec : TeamsReductions) {
- QualType RecTy = C.getRecordType(TeamReductionRec);
- auto *Field = FieldDecl::Create(
- C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
- C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
- /*BW=*/nullptr, /*Mutable=*/false,
- /*InitStyle=*/ICIS_NoInit);
- Field->setAccess(AS_public);
- StaticRD->addDecl(Field);
- }
- StaticRD->completeDefinition();
- QualType StaticTy = C.getRecordType(StaticRD);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- // FIXME: nvlink does not handle weak linkage correctly (object with the
- //
diff erent size are reported as erroneous).
- // Restore CommonLinkage as soon as nvlink is fixed.
- auto *GV = new llvm::GlobalVariable(
- CGM.getModule(), LLVMReductionsBufferTy,
- /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
- llvm::Constant::getNullValue(LLVMReductionsBufferTy),
- "_openmp_teams_reductions_buffer_$_");
- KernelTeamsReductionPtr->setInitializer(
- llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
- CGM.VoidPtrTy));
- }
- CGOpenMPRuntime::clear();
-}
-
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Module *M = &CGF.CGM.getModule();
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 46e1361f2f895ba..141436f26230dde 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -130,7 +130,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
public:
explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
- void clear() override;
bool isGPU() const override { return true; };
@@ -386,7 +385,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// Maps the function to the list of the globalized variables with their
/// addresses.
llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
- llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
/// List of the records with the list of fields for the reductions across the
/// teams. Used to build the intermediate buffer for the fast teams
/// reductions.
diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
index f4ec40b030a41c1..137ef3861751bb8 100644
--- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
@@ -97,14 +97,14 @@ int bar(int n){
// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: store ptr [[E1]], ptr [[TMP4]], align 8
-// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8
-// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
-// CHECK1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
-// CHECK1-NEXT: br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
+// CHECK1-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
+// CHECK1-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
-// CHECK1-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8
-// CHECK1-NEXT: [[TMP9:%.*]] = load double, ptr [[E1]], align 8
-// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8
+// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
// CHECK1-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@@ -386,21 +386,21 @@ int bar(int n){
// CHECK1-NEXT: store ptr [[C1]], ptr [[TMP6]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: store ptr [[D2]], ptr [[TMP7]], align 8
-// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8
-// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
-// CHECK1-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK1-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
+// CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
-// CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1
-// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP11]] to i32
-// CHECK1-NEXT: [[TMP12:%.*]] = load i8, ptr [[C1]], align 1
-// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP12]] to i32
+// CHECK1-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
+// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
+// CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
+// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
// CHECK1-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
// CHECK1-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
// CHECK1-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1
-// CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4
-// CHECK1-NEXT: [[TMP14:%.*]] = load float, ptr [[D2]], align 4
-// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4
+// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
// CHECK1-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@@ -727,29 +727,29 @@ int bar(int n){
// CHECK1-NEXT: store ptr [[A1]], ptr [[TMP6]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: store ptr [[B2]], ptr [[TMP7]], align 8
-// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8
-// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
-// CHECK1-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK1-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
+// CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
-// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
+// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
// CHECK1-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4
-// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
-// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
-// CHECK1-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
-// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK1-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32
+// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
-// CHECK1-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK1-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK1-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
// CHECK1-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@@ -1157,13 +1157,13 @@ int bar(int n){
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0
// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
-// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK1-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
-// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
// CHECK1-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128
@@ -1183,11 +1183,11 @@ int bar(int n){
// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
@@ -1209,13 +1209,13 @@ int bar(int n){
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0
// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
-// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
// CHECK1-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
-// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
// CHECK1-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
@@ -1235,11 +1235,11 @@ int bar(int n){
// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
@@ -1294,14 +1294,14 @@ int bar(int n){
// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: store ptr [[E1]], ptr [[TMP4]], align 4
-// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
-// CHECK2-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
-// CHECK2-NEXT: br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
+// CHECK2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
+// CHECK2-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2: .omp.reduction.then:
-// CHECK2-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8
-// CHECK2-NEXT: [[TMP9:%.*]] = load double, ptr [[E1]], align 8
-// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]]
+// CHECK2-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8
+// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
// CHECK2-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@@ -1583,21 +1583,21 @@ int bar(int n){
// CHECK2-NEXT: store ptr [[C1]], ptr [[TMP6]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: store ptr [[D2]], ptr [[TMP7]], align 4
-// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK2-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
-// CHECK2-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK2-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
+// CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2: .omp.reduction.then:
-// CHECK2-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1
-// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP11]] to i32
-// CHECK2-NEXT: [[TMP12:%.*]] = load i8, ptr [[C1]], align 1
-// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP12]] to i32
+// CHECK2-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
+// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
+// CHECK2-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
+// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
// CHECK2-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
// CHECK2-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
// CHECK2-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1
-// CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4
-// CHECK2-NEXT: [[TMP14:%.*]] = load float, ptr [[D2]], align 4
-// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4
+// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
// CHECK2-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@@ -1924,29 +1924,29 @@ int bar(int n){
// CHECK2-NEXT: store ptr [[A1]], ptr [[TMP6]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: store ptr [[B2]], ptr [[TMP7]], align 4
-// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK2-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
-// CHECK2-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK2-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
+// CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2: .omp.reduction.then:
-// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
-// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]]
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
+// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
// CHECK2-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4
-// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
-// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
-// CHECK2-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
-// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK2-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32
+// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK2: cond.true:
-// CHECK2-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK2-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK2-NEXT: br label [[COND_END:%.*]]
// CHECK2: cond.false:
-// CHECK2-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK2-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
// CHECK2-NEXT: br label [[COND_END]]
// CHECK2: cond.end:
-// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
+// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
// CHECK2-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@@ -2354,13 +2354,13 @@ int bar(int n){
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
-// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK2-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
-// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
// CHECK2-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128
@@ -2380,11 +2380,11 @@ int bar(int n){
// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK2-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
@@ -2406,13 +2406,13 @@ int bar(int n){
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
-// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
// CHECK2-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
-// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
// CHECK2-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
@@ -2432,11 +2432,11 @@ int bar(int n){
// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK2-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
@@ -2491,14 +2491,14 @@ int bar(int n){
// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: store ptr [[E1]], ptr [[TMP4]], align 4
-// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
-// CHECK3-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
-// CHECK3-NEXT: br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
+// CHECK3-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
+// CHECK3-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3: .omp.reduction.then:
-// CHECK3-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8
-// CHECK3-NEXT: [[TMP9:%.*]] = load double, ptr [[E1]], align 8
-// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
+// CHECK3-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8
+// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
// CHECK3-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@@ -2780,21 +2780,21 @@ int bar(int n){
// CHECK3-NEXT: store ptr [[C1]], ptr [[TMP6]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: store ptr [[D2]], ptr [[TMP7]], align 4
-// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK3-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
-// CHECK3-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK3-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
+// CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3: .omp.reduction.then:
-// CHECK3-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1
-// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP11]] to i32
-// CHECK3-NEXT: [[TMP12:%.*]] = load i8, ptr [[C1]], align 1
-// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP12]] to i32
+// CHECK3-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
+// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
+// CHECK3-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
+// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
// CHECK3-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
// CHECK3-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
// CHECK3-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1
-// CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4
-// CHECK3-NEXT: [[TMP14:%.*]] = load float, ptr [[D2]], align 4
-// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]]
+// CHECK3-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4
+// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
// CHECK3-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@@ -3121,29 +3121,29 @@ int bar(int n){
// CHECK3-NEXT: store ptr [[A1]], ptr [[TMP6]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: store ptr [[B2]], ptr [[TMP7]], align 4
-// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK3-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
-// CHECK3-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK3-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
+// CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3: .omp.reduction.then:
-// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
-// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]]
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
+// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
// CHECK3-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4
-// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
-// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
-// CHECK3-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
-// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK3-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32
+// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK3: cond.true:
-// CHECK3-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK3-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK3-NEXT: br label [[COND_END:%.*]]
// CHECK3: cond.false:
-// CHECK3-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK3-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
// CHECK3-NEXT: br label [[COND_END]]
// CHECK3: cond.end:
-// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
+// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
// CHECK3-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@@ -3551,13 +3551,13 @@ int bar(int n){
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
-// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK3-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
-// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
// CHECK3-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128
@@ -3577,11 +3577,11 @@ int bar(int n){
// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
@@ -3603,13 +3603,13 @@ int bar(int n){
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
-// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
// CHECK3-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
-// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
// CHECK3-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
@@ -3629,11 +3629,11 @@ int bar(int n){
// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
index 63926bb4440811e..f9aef3acb1c611e 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
@@ -1328,24 +1328,24 @@ int foo() {
// IR-GPU-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
// IR-GPU-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
// IR-GPU-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP39]], align 8
-// IR-GPU-NEXT: [[TMP40:%.*]] = load ptr, ptr addrspace(1) @"_openmp_teams_reductions_buffer_$_$ptr", align 8
-// IR-GPU-NEXT: [[TMP41:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr [[TMP40]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
-// IR-GPU-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP41]], 1
-// IR-GPU-NEXT: br i1 [[TMP42]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// IR-GPU-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// IR-GPU-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
+// IR-GPU-NEXT: [[TMP41:%.*]] = icmp eq i32 [[TMP40]], 1
+// IR-GPU-NEXT: br i1 [[TMP41]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// IR-GPU: .omp.reduction.then:
-// IR-GPU-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
-// IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP43]]
+// IR-GPU-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP42]]
// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
// IR-GPU: omp.arraycpy.body:
// IR-GPU-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
// IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// IR-GPU-NEXT: [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
-// IR-GPU-NEXT: [[TMP45:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
-// IR-GPU-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
+// IR-GPU-NEXT: [[TMP43:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
+// IR-GPU-NEXT: [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-GPU-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP43]], [[TMP44]]
// IR-GPU-NEXT: store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
// IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1
// IR-GPU-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP43]]
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP42]]
// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
// IR-GPU: omp.arraycpy.done17:
// IR-GPU-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP38]])
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
index 6ce56475c09b37f..a603e91d1182d41 100644
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -234,6 +234,8 @@ void __kmpc_nvptx_end_reduce(int32_t TId);
void __kmpc_nvptx_end_reduce_nowait(int32_t TId);
+void *__kmpc_reduction_get_fixed_buffer();
+
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct);
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index a041d239e1abb44..efa09cafa879ec1 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -167,9 +167,6 @@ uint32_t roundToWarpsize(uint32_t s) {
uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
-static uint32_t IterCnt = 0;
-static uint32_t Cnt = 0;
-
} // namespace
extern "C" {
@@ -194,6 +191,9 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
ThreadId = 0;
}
+ uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
+ uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
+
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
@@ -313,4 +313,8 @@ void __kmpc_nvptx_end_reduce(int32_t TId) {}
void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {}
}
+void *__kmpc_reduction_get_fixed_buffer() {
+ return state::getKernelLaunchEnvironment().ReductionBuffer;
+}
+
#pragma omp end declare target
diff --git a/openmp/libomptarget/include/Environment.h b/openmp/libomptarget/include/Environment.h
index 9c02e2390581dcd..bd493e8a0be78f1 100644
--- a/openmp/libomptarget/include/Environment.h
+++ b/openmp/libomptarget/include/Environment.h
@@ -77,15 +77,16 @@ struct DynamicEnvironmentTy {
// NOTE: Please don't change the order of those members as their indices are
// used in the middle end. Always add the new data member at the end.
struct ConfigurationEnvironmentTy {
- uint8_t UseGenericStateMachine;
- uint8_t MayUseNestedParallelism;
- llvm::omp::OMPTgtExecModeFlags ExecMode;
+ uint8_t UseGenericStateMachine = 2;
+ uint8_t MayUseNestedParallelism = 2;
+ llvm::omp::OMPTgtExecModeFlags ExecMode = llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
// Information about (legal) launch configurations.
//{
- int32_t MinThreads;
- int32_t MaxThreads;
- int32_t MinTeams;
- int32_t MaxTeams;
+ int32_t MinThreads = -1;
+ int32_t MaxThreads = -1;
+ int32_t MinTeams = -1;
+ int32_t MaxTeams = -1;
+ int32_t ReductionBufferSize = 0;
//}
};
@@ -93,10 +94,14 @@ struct ConfigurationEnvironmentTy {
// used in the middle end. Always add the new data member at the end.
struct KernelEnvironmentTy {
ConfigurationEnvironmentTy Configuration;
- IdentTy *Ident;
- DynamicEnvironmentTy *DynamicEnv;
+ IdentTy *Ident = nullptr;
+ DynamicEnvironmentTy *DynamicEnv = nullptr;
};
-struct KernelLaunchEnvironmentTy {};
+struct KernelLaunchEnvironmentTy {
+ uint32_t ReductionCnt = 0;
+ uint32_t ReductionIterCnt = 0;
+ void *ReductionBuffer = nullptr;
+};
#endif // _OMPTARGET_ENVIRONMENT_H_
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 69943486aa72055..8747502065a5937 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -402,9 +402,8 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
DP("Failed to read kernel environment for '%s': %s\n"
"Using default SPMD (2) execution mode\n",
Name, ErrStr.data());
- KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_SPMD;
- KernelEnvironment.Configuration.MayUseNestedParallelism = /*Unknown=*/2;
- KernelEnvironment.Configuration.UseGenericStateMachine = /*Unknown=*/2;
+ assert(KernelEnvironment.Configuration.ReductionBufferSize == 0 &&
+ "Default initialization failed.");
}
// Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
@@ -441,6 +440,17 @@ GenericKernelTy::getKernelLaunchEnvironment(
/// async data transfer.
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
LocalKLE = KernelLaunchEnvironment;
+ if (KernelEnvironment.Configuration.ReductionBufferSize) {
+ auto AllocOrErr = GenericDevice.dataAlloc(
+ KernelEnvironment.Configuration.ReductionBufferSize,
+ /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
+ if (!AllocOrErr)
+ return AllocOrErr.takeError();
+ LocalKLE.ReductionBuffer = *AllocOrErr;
+ // Remember to free the memory later.
+ AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
+ }
+
auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE,
sizeof(KernelLaunchEnvironmentTy),
AsyncInfoWrapper);
diff --git a/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp b/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
new file mode 100644
index 000000000000000..5303a9463f15516
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
@@ -0,0 +1,36 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+
+// FIXME: This is a bug in host offload, this should run fine.
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <iostream>
+#include <vector>
+
+#define N 8
+
+int main() {
+ std::vector<int> avec(N);
+ int *a = avec.data();
+#pragma omp parallel for
+ for (int i = 0; i < N; i++) {
+ a[i] = 0;
+#pragma omp target teams distribute parallel for reduction(+ : a[i])
+ for (int j = 0; j < N; j++)
+ a[i] += 1;
+ }
+
+ // CHECK: 8
+ // CHECK: 8
+ // CHECK: 8
+ // CHECK: 8
+ // CHECK: 8
+ // CHECK: 8
+ // CHECK: 8
+ // CHECK: 8
+ for (int i = 0; i < N; i++)
+ std::cout << a[i] << std::endl;
+}
More information about the cfe-commits
mailing list