[Openmp-commits] [openmp] [clang] [OpenMP] Team reduction work specialization (PR #70766)

Tue Oct 31 22:39:19 PDT 2023

https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/70766

>From 04aafdce6f259e31304ed47118a56042b155bd77 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 30 Oct 2023 16:39:00 -0700
Subject: [PATCH 1/2] [OpenMP][FIX] Allocate per launch memory for GPU team
 reductions

We used to perform team reduction on global memory allocated in the
runtime and by clang. This was racy as multiple instances of a kernel,
or different kernels with team reductions, would use the same locations.
Since we now have the kernel launch environment, we can allocate dynamic
memory per-launch, allowing us to move all the state into a non-racy
place.

Fixes: https://github.com/llvm/llvm-project/issues/70249
---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |  75 ++----
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.h        |   2 -
 .../OpenMP/nvptx_teams_reduction_codegen.cpp  | 240 +++++++++---------
 .../target_teams_generic_loop_codegen.cpp     |  20 +-
 .../DeviceRTL/include/Interface.h             |   2 +
 .../libomptarget/DeviceRTL/src/Reduction.cpp  |  10 +-
 openmp/libomptarget/include/Environment.h     |   7 +-
 .../PluginInterface/PluginInterface.cpp       |  11 +
 .../common/PluginInterface/PluginInterface.h  |   2 +-
 .../parallel_target_teams_reduction.cpp       |  36 +++
 10 files changed, 221 insertions(+), 184 deletions(-)
 create mode 100644 openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index bd9329b8e2d4113..0ed665e0dfb9722 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -803,8 +803,30 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
   if (!IsSPMD)
     emitGenericVarsEpilog(CGF);
 
+  // This is temporary until we remove the fixed sized buffer.
+  ASTContext &C = CGM.getContext();
+  RecordDecl *StaticRD = C.buildImplicitRecord(
+      "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
+  StaticRD->startDefinition();
+  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
+    QualType RecTy = C.getRecordType(TeamReductionRec);
+    auto *Field = FieldDecl::Create(
+        C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
+        C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
+        /*BW=*/nullptr, /*Mutable=*/false,
+        /*InitStyle=*/ICIS_NoInit);
+    Field->setAccess(AS_public);
+    StaticRD->addDecl(Field);
+  }
+  StaticRD->completeDefinition();
+  QualType StaticTy = C.getRecordType(StaticRD);
+  llvm::Type *LLVMReductionsBufferTy =
+      CGM.getTypes().ConvertTypeForMem(StaticTy);
+  const auto &DL = CGM.getModule().getDataLayout();
+  uint64_t BufferSize =
+      DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
   CGBuilderTy &Bld = CGF.Builder;
-  OMPBuilder.createTargetDeinit(Bld);
+  OMPBuilder.createTargetDeinit(Bld, BufferSize);
 }
 
 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@@ -2998,15 +3020,10 @@ void CGOpenMPRuntimeGPU::emitReduction(
         CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
         C.getLangOpts().OpenMPCUDAReductionBufNum);
     TeamsReductions.push_back(TeamReductionRec);
-    if (!KernelTeamsReductionPtr) {
-      KernelTeamsReductionPtr = new llvm::GlobalVariable(
-          CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
-          llvm::GlobalValue::InternalLinkage, nullptr,
-          "_openmp_teams_reductions_buffer_$_$ptr");
-    }
-    llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
-        Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
-        /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
+    auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
+        OMPBuilder.getOrCreateRuntimeFunction(
+            CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
+        {}, "_openmp_teams_reductions_buffer_$_$ptr");
     llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
     llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
@@ -3021,7 +3038,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
     llvm::Value *Args[] = {
         RTLoc,
         ThreadId,
-        GlobalBufferPtr,
+        KernelTeamsReductionPtr,
         CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
         RL,
         ShuffleAndReduceFn,
@@ -3654,42 +3671,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
   CGOpenMPRuntime::processRequiresDirective(D);
 }
 
-void CGOpenMPRuntimeGPU::clear() {
-
-  if (!TeamsReductions.empty()) {
-    ASTContext &C = CGM.getContext();
-    RecordDecl *StaticRD = C.buildImplicitRecord(
-        "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
-    StaticRD->startDefinition();
-    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
-      QualType RecTy = C.getRecordType(TeamReductionRec);
-      auto *Field = FieldDecl::Create(
-          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
-          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
-          /*BW=*/nullptr, /*Mutable=*/false,
-          /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      StaticRD->addDecl(Field);
-    }
-    StaticRD->completeDefinition();
-    QualType StaticTy = C.getRecordType(StaticRD);
-    llvm::Type *LLVMReductionsBufferTy =
-        CGM.getTypes().ConvertTypeForMem(StaticTy);
-    // FIXME: nvlink does not handle weak linkage correctly (object with the
-    // different size are reported as erroneous).
-    // Restore CommonLinkage as soon as nvlink is fixed.
-    auto *GV = new llvm::GlobalVariable(
-        CGM.getModule(), LLVMReductionsBufferTy,
-        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
-        "_openmp_teams_reductions_buffer_$_");
-    KernelTeamsReductionPtr->setInitializer(
-        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
-                                                             CGM.VoidPtrTy));
-  }
-  CGOpenMPRuntime::clear();
-}
-
 llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
   CGBuilderTy &Bld = CGF.Builder;
   llvm::Module *M = &CGF.CGM.getModule();
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 46e1361f2f895ba..141436f26230dde 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -130,7 +130,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
 
 public:
   explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
-  void clear() override;
 
   bool isGPU() const override { return true; };
 
@@ -386,7 +385,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
   /// Maps the function to the list of the globalized variables with their
   /// addresses.
   llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
-  llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
   /// List of the records with the list of fields for the reductions across the
   /// teams. Used to build the intermediate buffer for the fast teams
   /// reductions.
diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
index f4ec40b030a41c1..137ef3861751bb8 100644
--- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
@@ -97,14 +97,14 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
 // CHECK1-NEXT:    store ptr [[E1]], ptr [[TMP4]], align 8
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8
-// CHECK1-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
-// CHECK1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
-// CHECK1-NEXT:    br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK1-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK1-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
+// CHECK1-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
+// CHECK1-NEXT:    br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK1:       .omp.reduction.then:
-// CHECK1-NEXT:    [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8
-// CHECK1-NEXT:    [[TMP9:%.*]] = load double, ptr [[E1]], align 8
-// CHECK1-NEXT:    [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
+// CHECK1-NEXT:    [[TMP8:%.*]] = load double, ptr [[E1]], align 8
+// CHECK1-NEXT:    [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
 // CHECK1-NEXT:    store double [[ADD2]], ptr [[TMP0]], align 8
 // CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
@@ -386,21 +386,21 @@ int bar(int n){
 // CHECK1-NEXT:    store ptr [[C1]], ptr [[TMP6]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
 // CHECK1-NEXT:    store ptr [[D2]], ptr [[TMP7]], align 8
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8
-// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
-// CHECK1-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK1-NEXT:    br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK1-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK1-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
+// CHECK1-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK1-NEXT:    br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK1:       .omp.reduction.then:
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1
-// CHECK1-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP11]] to i32
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i8, ptr [[C1]], align 1
-// CHECK1-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP12]] to i32
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
+// CHECK1-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
+// CHECK1-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
 // CHECK1-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
 // CHECK1-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
 // CHECK1-NEXT:    store i8 [[CONV7]], ptr [[TMP0]], align 1
-// CHECK1-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4
-// CHECK1-NEXT:    [[TMP14:%.*]] = load float, ptr [[D2]], align 4
-// CHECK1-NEXT:    [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = load float, ptr [[D2]], align 4
+// CHECK1-NEXT:    [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
 // CHECK1-NEXT:    store float [[MUL8]], ptr [[TMP1]], align 4
 // CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
@@ -727,29 +727,29 @@ int bar(int n){
 // CHECK1-NEXT:    store ptr [[A1]], ptr [[TMP6]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
 // CHECK1-NEXT:    store ptr [[B2]], ptr [[TMP7]], align 8
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8
-// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
-// CHECK1-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK1-NEXT:    br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK1-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK1-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
+// CHECK1-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK1-NEXT:    br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK1:       .omp.reduction.then:
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
-// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
+// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
 // CHECK1-NEXT:    store i32 [[OR]], ptr [[TMP0]], align 4
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
-// CHECK1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP13]] to i32
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
-// CHECK1-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP12]] to i32
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK1-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
 // CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
 // CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
 // CHECK1-NEXT:    br label [[COND_END:%.*]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
 // CHECK1-NEXT:    br label [[COND_END]]
 // CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
 // CHECK1-NEXT:    store i16 [[COND]], ptr [[TMP1]], align 2
 // CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
@@ -1157,13 +1157,13 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
-// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
 // CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    store i32 [[TMP9]], ptr [[TMP8]], align 128
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
-// CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
 // CHECK1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
 // CHECK1-NEXT:    store i16 [[TMP13]], ptr [[TMP12]], align 128
@@ -1183,11 +1183,11 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
 // CHECK1-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
 // CHECK1-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 8
 // CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
@@ -1209,13 +1209,13 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
-// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
 // CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
 // CHECK1-NEXT:    store i32 [[TMP9]], ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
-// CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
 // CHECK1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
 // CHECK1-NEXT:    store i16 [[TMP13]], ptr [[TMP11]], align 2
@@ -1235,11 +1235,11 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
 // CHECK1-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
 // CHECK1-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 8
 // CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
@@ -1294,14 +1294,14 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
 // CHECK2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
 // CHECK2-NEXT:    store ptr [[E1]], ptr [[TMP4]], align 4
-// CHECK2-NEXT:    [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK2-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
-// CHECK2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
-// CHECK2-NEXT:    br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK2-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK2-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
+// CHECK2-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
+// CHECK2-NEXT:    br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK2:       .omp.reduction.then:
-// CHECK2-NEXT:    [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8
-// CHECK2-NEXT:    [[TMP9:%.*]] = load double, ptr [[E1]], align 8
-// CHECK2-NEXT:    [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]]
+// CHECK2-NEXT:    [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
+// CHECK2-NEXT:    [[TMP8:%.*]] = load double, ptr [[E1]], align 8
+// CHECK2-NEXT:    [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
 // CHECK2-NEXT:    store double [[ADD2]], ptr [[TMP0]], align 8
 // CHECK2-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
 // CHECK2-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
@@ -1583,21 +1583,21 @@ int bar(int n){
 // CHECK2-NEXT:    store ptr [[C1]], ptr [[TMP6]], align 4
 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
 // CHECK2-NEXT:    store ptr [[D2]], ptr [[TMP7]], align 4
-// CHECK2-NEXT:    [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK2-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
-// CHECK2-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK2-NEXT:    br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK2-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK2-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
+// CHECK2-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK2-NEXT:    br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK2:       .omp.reduction.then:
-// CHECK2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1
-// CHECK2-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP11]] to i32
-// CHECK2-NEXT:    [[TMP12:%.*]] = load i8, ptr [[C1]], align 1
-// CHECK2-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP12]] to i32
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
+// CHECK2-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
+// CHECK2-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
 // CHECK2-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
 // CHECK2-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
 // CHECK2-NEXT:    store i8 [[CONV7]], ptr [[TMP0]], align 1
-// CHECK2-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4
-// CHECK2-NEXT:    [[TMP14:%.*]] = load float, ptr [[D2]], align 4
-// CHECK2-NEXT:    [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]]
+// CHECK2-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
+// CHECK2-NEXT:    [[TMP13:%.*]] = load float, ptr [[D2]], align 4
+// CHECK2-NEXT:    [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
 // CHECK2-NEXT:    store float [[MUL8]], ptr [[TMP1]], align 4
 // CHECK2-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK2-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
@@ -1924,29 +1924,29 @@ int bar(int n){
 // CHECK2-NEXT:    store ptr [[A1]], ptr [[TMP6]], align 4
 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
 // CHECK2-NEXT:    store ptr [[B2]], ptr [[TMP7]], align 4
-// CHECK2-NEXT:    [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK2-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
-// CHECK2-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK2-NEXT:    br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK2-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK2-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
+// CHECK2-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK2-NEXT:    br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK2:       .omp.reduction.then:
-// CHECK2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
-// CHECK2-NEXT:    [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]]
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
+// CHECK2-NEXT:    [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
 // CHECK2-NEXT:    store i32 [[OR]], ptr [[TMP0]], align 4
-// CHECK2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
-// CHECK2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP13]] to i32
-// CHECK2-NEXT:    [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
-// CHECK2-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP12]] to i32
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK2-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
 // CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
 // CHECK2-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK2:       cond.true:
-// CHECK2-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK2-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
 // CHECK2-NEXT:    br label [[COND_END:%.*]]
 // CHECK2:       cond.false:
-// CHECK2-NEXT:    [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK2-NEXT:    [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
 // CHECK2-NEXT:    br label [[COND_END]]
 // CHECK2:       cond.end:
-// CHECK2-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
+// CHECK2-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
 // CHECK2-NEXT:    store i16 [[COND]], ptr [[TMP1]], align 2
 // CHECK2-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK2-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
@@ -2354,13 +2354,13 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
-// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
 // CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
 // CHECK2-NEXT:    store i32 [[TMP9]], ptr [[TMP8]], align 128
 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
 // CHECK2-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
-// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
 // CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
 // CHECK2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
 // CHECK2-NEXT:    store i16 [[TMP13]], ptr [[TMP12]], align 128
@@ -2380,11 +2380,11 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
 // CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
 // CHECK2-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 4
 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
 // CHECK2-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 4
 // CHECK2-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
@@ -2406,13 +2406,13 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
-// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
 // CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
 // CHECK2-NEXT:    store i32 [[TMP9]], ptr [[TMP7]], align 4
 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
 // CHECK2-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
-// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
 // CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
 // CHECK2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
 // CHECK2-NEXT:    store i16 [[TMP13]], ptr [[TMP11]], align 2
@@ -2432,11 +2432,11 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
 // CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
 // CHECK2-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 4
 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
 // CHECK2-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 4
 // CHECK2-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
@@ -2491,14 +2491,14 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
 // CHECK3-NEXT:    store ptr [[E1]], ptr [[TMP4]], align 4
-// CHECK3-NEXT:    [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK3-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
-// CHECK3-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
-// CHECK3-NEXT:    br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK3-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK3-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
+// CHECK3-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
+// CHECK3-NEXT:    br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK3:       .omp.reduction.then:
-// CHECK3-NEXT:    [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8
-// CHECK3-NEXT:    [[TMP9:%.*]] = load double, ptr [[E1]], align 8
-// CHECK3-NEXT:    [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]]
+// CHECK3-NEXT:    [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
+// CHECK3-NEXT:    [[TMP8:%.*]] = load double, ptr [[E1]], align 8
+// CHECK3-NEXT:    [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
 // CHECK3-NEXT:    store double [[ADD2]], ptr [[TMP0]], align 8
 // CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
 // CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
@@ -2780,21 +2780,21 @@ int bar(int n){
 // CHECK3-NEXT:    store ptr [[C1]], ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
 // CHECK3-NEXT:    store ptr [[D2]], ptr [[TMP7]], align 4
-// CHECK3-NEXT:    [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK3-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
-// CHECK3-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK3-NEXT:    br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK3-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK3-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
+// CHECK3-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK3-NEXT:    br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK3:       .omp.reduction.then:
-// CHECK3-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1
-// CHECK3-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP11]] to i32
-// CHECK3-NEXT:    [[TMP12:%.*]] = load i8, ptr [[C1]], align 1
-// CHECK3-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP12]] to i32
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
+// CHECK3-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
+// CHECK3-NEXT:    [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
+// CHECK3-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
 // CHECK3-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
 // CHECK3-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
 // CHECK3-NEXT:    store i8 [[CONV7]], ptr [[TMP0]], align 1
-// CHECK3-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4
-// CHECK3-NEXT:    [[TMP14:%.*]] = load float, ptr [[D2]], align 4
-// CHECK3-NEXT:    [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]]
+// CHECK3-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
+// CHECK3-NEXT:    [[TMP13:%.*]] = load float, ptr [[D2]], align 4
+// CHECK3-NEXT:    [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
 // CHECK3-NEXT:    store float [[MUL8]], ptr [[TMP1]], align 4
 // CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
@@ -3121,29 +3121,29 @@ int bar(int n){
 // CHECK3-NEXT:    store ptr [[A1]], ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
 // CHECK3-NEXT:    store ptr [[B2]], ptr [[TMP7]], align 4
-// CHECK3-NEXT:    [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK3-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
-// CHECK3-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
-// CHECK3-NEXT:    br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK3-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// CHECK3-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
+// CHECK3-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
+// CHECK3-NEXT:    br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK3:       .omp.reduction.then:
-// CHECK3-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK3-NEXT:    [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
-// CHECK3-NEXT:    [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]]
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
+// CHECK3-NEXT:    [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
 // CHECK3-NEXT:    store i32 [[OR]], ptr [[TMP0]], align 4
-// CHECK3-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
-// CHECK3-NEXT:    [[CONV:%.*]] = sext i16 [[TMP13]] to i32
-// CHECK3-NEXT:    [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
-// CHECK3-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK3-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK3-NEXT:    [[CONV:%.*]] = sext i16 [[TMP12]] to i32
+// CHECK3-NEXT:    [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK3-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
 // CHECK3-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
 // CHECK3-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK3:       cond.true:
-// CHECK3-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
+// CHECK3-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
 // CHECK3-NEXT:    br label [[COND_END:%.*]]
 // CHECK3:       cond.false:
-// CHECK3-NEXT:    [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
+// CHECK3-NEXT:    [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
 // CHECK3-NEXT:    br label [[COND_END]]
 // CHECK3:       cond.end:
-// CHECK3-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
+// CHECK3-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
 // CHECK3-NEXT:    store i16 [[COND]], ptr [[TMP1]], align 2
 // CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
@@ -3551,13 +3551,13 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
-// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
 // CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    store i32 [[TMP9]], ptr [[TMP8]], align 128
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
-// CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
 // CHECK3-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
 // CHECK3-NEXT:    store i16 [[TMP13]], ptr [[TMP12]], align 128
@@ -3577,11 +3577,11 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
 // CHECK3-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
 // CHECK3-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
@@ -3603,13 +3603,13 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
-// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
 // CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
 // CHECK3-NEXT:    store i32 [[TMP9]], ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
-// CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
 // CHECK3-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
 // CHECK3-NEXT:    store i16 [[TMP13]], ptr [[TMP11]], align 2
@@ -3629,11 +3629,11 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
 // CHECK3-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
+// CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
 // CHECK3-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
index 63926bb4440811e..f9aef3acb1c611e 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
@@ -1328,24 +1328,24 @@ int foo() {
 // IR-GPU-NEXT:    [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
 // IR-GPU-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
 // IR-GPU-NEXT:    store ptr [[SUM1_ASCAST]], ptr [[TMP39]], align 8
-// IR-GPU-NEXT:    [[TMP40:%.*]] = load ptr, ptr addrspace(1) @"_openmp_teams_reductions_buffer_$_$ptr", align 8
-// IR-GPU-NEXT:    [[TMP41:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr [[TMP40]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
-// IR-GPU-NEXT:    [[TMP42:%.*]] = icmp eq i32 [[TMP41]], 1
-// IR-GPU-NEXT:    br i1 [[TMP42]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// IR-GPU-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
+// IR-GPU-NEXT:    [[TMP40:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
+// IR-GPU-NEXT:    [[TMP41:%.*]] = icmp eq i32 [[TMP40]], 1
+// IR-GPU-NEXT:    br i1 [[TMP41]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // IR-GPU:       .omp.reduction.then:
-// IR-GPU-NEXT:    [[TMP43:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
-// IR-GPU-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP43]]
+// IR-GPU-NEXT:    [[TMP42:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-GPU-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP42]]
 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
 // IR-GPU:       omp.arraycpy.body:
 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ]
-// IR-GPU-NEXT:    [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
-// IR-GPU-NEXT:    [[TMP45:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
-// IR-GPU-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
+// IR-GPU-NEXT:    [[TMP43:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
+// IR-GPU-NEXT:    [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-GPU-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP43]], [[TMP44]]
 // IR-GPU-NEXT:    store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1
 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
-// IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP43]]
+// IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP42]]
 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
 // IR-GPU:       omp.arraycpy.done17:
 // IR-GPU-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP38]])
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
index 6ce56475c09b37f..a603e91d1182d41 100644
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -234,6 +234,8 @@ void __kmpc_nvptx_end_reduce(int32_t TId);
 
 void __kmpc_nvptx_end_reduce_nowait(int32_t TId);
 
+void *__kmpc_reduction_get_fixed_buffer();
+
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct);
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index a041d239e1abb44..efa09cafa879ec1 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -167,9 +167,6 @@ uint32_t roundToWarpsize(uint32_t s) {
 
 uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
 
-static uint32_t IterCnt = 0;
-static uint32_t Cnt = 0;
-
 } // namespace
 
 extern "C" {
@@ -194,6 +191,9 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     ThreadId = 0;
   }
 
+  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
+  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
+
   // In non-generic mode all workers participate in the teams reduction.
   // In generic mode only the team master participates in the teams
   // reduction because the workers are waiting for parallel work.
@@ -313,4 +313,8 @@ void __kmpc_nvptx_end_reduce(int32_t TId) {}
 void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {}
 }
 
+void *__kmpc_reduction_get_fixed_buffer() {
+  return state::getKernelLaunchEnvironment().ReductionBuffer;
+}
+
 #pragma omp end declare target
diff --git a/openmp/libomptarget/include/Environment.h b/openmp/libomptarget/include/Environment.h
index 9c02e2390581dcd..b1607c025bbc6a9 100644
--- a/openmp/libomptarget/include/Environment.h
+++ b/openmp/libomptarget/include/Environment.h
@@ -86,6 +86,7 @@ struct ConfigurationEnvironmentTy {
   int32_t MaxThreads;
   int32_t MinTeams;
   int32_t MaxTeams;
+  int32_t ReductionBufferSize;
   //}
 };
 
@@ -97,6 +98,10 @@ struct KernelEnvironmentTy {
   DynamicEnvironmentTy *DynamicEnv;
 };
 
-struct KernelLaunchEnvironmentTy {};
+struct KernelLaunchEnvironmentTy {
+  uint32_t ReductionCnt;
+  uint32_t ReductionIterCnt;
+  void *ReductionBuffer;
+};
 
 #endif // _OMPTARGET_ENVIRONMENT_H_
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 69943486aa72055..50d1433f923bd7a 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -441,6 +441,17 @@ GenericKernelTy::getKernelLaunchEnvironment(
   /// async data transfer.
   auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
   LocalKLE = KernelLaunchEnvironment;
+  if (KernelEnvironment.Configuration.ReductionBufferSize) {
+    auto AllocOrErr = GenericDevice.dataAlloc(
+        KernelEnvironment.Configuration.ReductionBufferSize,
+        /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
+    if (!AllocOrErr)
+      return AllocOrErr.takeError();
+    LocalKLE.ReductionBuffer = *AllocOrErr;
+    // Remember to free the memory later.
+    AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
+  }
+
   auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE,
                                       sizeof(KernelLaunchEnvironmentTy),
                                       AsyncInfoWrapper);
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index bbd6acd19bb01ae..2a139e1f266b5c6 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -387,7 +387,7 @@ struct GenericKernelTy {
   KernelEnvironmentTy KernelEnvironment;
 
   /// The prototype kernel launch environment.
-  KernelLaunchEnvironmentTy KernelLaunchEnvironment;
+  KernelLaunchEnvironmentTy KernelLaunchEnvironment = {0, 0};
 };
 
 /// Class representing a map of host pinned allocations. We track these pinned
diff --git a/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp b/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
new file mode 100644
index 000000000000000..5303a9463f15516
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
@@ -0,0 +1,36 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+
+// FIXME: This is a bug in host offload, this should run fine.
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <iostream>
+#include <vector>
+
+#define N 8
+
+int main() {
+  std::vector<int> avec(N);
+  int *a = avec.data();
+#pragma omp parallel for
+  for (int i = 0; i < N; i++) {
+    a[i] = 0;
+#pragma omp target teams distribute parallel for reduction(+ : a[i])
+    for (int j = 0; j < N; j++)
+      a[i] += 1;
+  }
+
+  // CHECK: 8
+  // CHECK: 8
+  // CHECK: 8
+  // CHECK: 8
+  // CHECK: 8
+  // CHECK: 8
+  // CHECK: 8
+  // CHECK: 8
+  for (int i = 0; i < N; i++)
+    std::cout << a[i] << std::endl;
+}

>From 9c557e131271a7848966c40098c69b0a430e0c32 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 30 Oct 2023 22:31:45 -0700
Subject: [PATCH 2/2] [OpenMP] Provide a specialized team reduction for the
 common case

We default to < 1024 teams if the user did not specify otherwise. As
such we can avoid the extra logic in the teams reduction that handles
more than num_of_records (default 1024) teams. This is a stopgap but
still shaves of 33% of the runtime in some simple reduction examples.
---
 .../libomptarget/DeviceRTL/src/Reduction.cpp  | 107 ++++++++++++++++++
 1 file changed, 107 insertions(+)

diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index efa09cafa879ec1..51bc16bdfd18e1f 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -178,11 +178,118 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
                                       false);
 }
 
+/// Mostly like _v2 but with the builtin assumption that we have less than
+/// num_of_records (by default 1024) teams.
+int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
+    IdentTy *Loc, int32_t TId, void *__restrict__ GlobalBuffer,
+    uint32_t num_of_records, void *reduce_data, ShuffleReductFnTy shflFct,
+    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
+    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
+  // Terminate all threads in non-SPMD mode except for the master thread.
+  uint32_t ThreadId = mapping::getThreadIdInBlock();
+  if (mapping::isGenericMode()) {
+    if (!mapping::isMainThreadInGenericMode())
+      return 0;
+    ThreadId = 0;
+  }
+
+  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
+
+  // In non-generic mode all workers participate in the teams reduction.
+  // In generic mode only the team master participates in the teams
+  // reduction because the workers are waiting for parallel work.
+  uint32_t NumThreads = omp_get_num_threads();
+  uint32_t TeamId = omp_get_team_num();
+  uint32_t NumTeams = omp_get_num_teams();
+  static unsigned SHARED(ChunkTeamCount);
+
+  // Block progress for teams greater than the current upper
+  // limit. We always only allow a number of teams less or equal
+  // to the number of slots in the buffer.
+  bool IsMaster = (ThreadId == 0);
+
+  if (IsMaster) {
+    lgcpyFct(GlobalBuffer, TeamId, reduce_data);
+
+    // Increment team counter.
+    // This counter is incremented by all teams in the current
+    // BUFFER_SIZE chunk.
+    ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
+                                 atomic::MemScopeTy::device);
+  }
+  // Synchronize
+  if (mapping::isSPMDMode())
+    synchronize::threadsAligned(atomic::acq_rel);
+  else
+    fence::kernel(atomic::acq_rel);
+
+  // reduce_data is global or shared so before being reduced within the
+  // warp we need to bring it in local memory:
+  // local_reduce_data = reduce_data[i]
+  //
+  // Example for 3 reduction variables a, b, c (of potentially different
+  // types):
+  //
+  // buffer layout (struct of arrays):
+  // a, a, ..., a, b, b, ... b, c, c, ... c
+  // |__________|
+  //     number of teams
+  //
+  // local_data_reduce layout (struct):
+  // a, b, c
+  //
+  // Each thread will have a local struct containing the values to be
+  // reduced:
+  //      1. do reduction within each warp.
+  //      2. do reduction across warps.
+  //      3. write the final result to the main reduction variable
+  //         by returning 1 in the thread holding the reduction result.
+
+  // Check if this is the very last team.
+  if (ChunkTeamCount != NumTeams - 1)
+    return 0;
+
+  // Last team processing.
+  NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
+  if (ThreadId >= NumThreads)
+    return 0;
+
+  // Load from buffer and reduce.
+  glcpyFct(GlobalBuffer, ThreadId, reduce_data);
+  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
+    glredFct(GlobalBuffer, i, reduce_data);
+
+  // Reduce across warps to the warp master.
+  gpu_regular_warp_reduce(reduce_data, shflFct);
+
+  uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
+  uint32_t WarpsNeeded =
+      (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+  // Gather all the reduced values from each warp
+  // to the first warp.
+  cpyFct(reduce_data, WarpsNeeded);
+
+  if (mapping::getWarpIdInBlock() == 0)
+    gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+
+  return IsMaster;
+}
+
 int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
     ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
     ListGlobalFnTy glredFct) {
+  // The first check is a compile time constant, the second one a runtime check.
+  // If the first one succeeds we will use the specialized version.
+  if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
+       state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
+       num_of_records == 1024) ||
+      (omp_get_num_teams() <= num_of_records))
+    return __kmpc_nvptx_teams_reduce_nowait_v3(
+        Loc, TId, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
+        lgcpyFct, lgredFct, glcpyFct, glredFct);
+
   // Terminate all threads in non-SPMD mode except for the master thread.
   uint32_t ThreadId = mapping::getThreadIdInBlock();
   if (mapping::isGenericMode()) {