[clang] [llvm] OpenMP offload 'simd' directive (PR #91261)

Eric Wright via cfe-commits cfe-commits at lists.llvm.org
Mon May 6 12:42:34 PDT 2024


https://github.com/efwright created https://github.com/llvm/llvm-project/pull/91261

This is a portion of the implementation of the 'simd' directive for OpenMP offload. Included in this PR are new code generation for OpenMP 'simd' loops using the OpenMPIRBuilder, and adjustments to how threads are mapped in the runtime, which will eventually be needed to support the extra level of thread parallelism that 'simd' brings.

There is currently no way for the programmer to adjust the 'simdlen', so a default value of 1 is used. For now, this means that using 'teams', 'parallel', and 'simd' together should behave the same as before, since the innermost level of parallelism is assigned a length of 1.
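
For context, here is a hypothetical combined construct (not taken from this patch) that exercises this path; with the default simdlen of 1, each 'parallel' thread executes its 'simd' iterations with a single lane:

    // Hypothetical user code, not part of the patch. With the default
    // simdlen of 1, each 'parallel' thread runs its 'simd' chunk with a
    // single lane, so the thread mapping matches the pre-patch behavior.
    double sum = 0.0;
    #pragma omp target teams distribute parallel for simd \
                reduction(+:sum) map(to:a[0:n]) map(tofrom:sum)
    for (int i = 0; i < n; ++i)
      sum += a[i];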

To summarize the code generation: the loop has three important parts we need to generate code for (see the sketch after the list):
1. LoopProlog - where the trip count of the loop is determined and any reduction/privatized variables are initialized.
2. LoopBody - effectively the body of the loop. Some additional code is generated to convert a given "iteration variable" into a "loop variable". This code is then outlined so that it can be passed as a "workload" into the runtime.
3. LoopEpilog - where reductions are finalized.
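
As a rough sketch, the generated structure looks like the following (illustrative C++-style pseudocode only; distance_fn, loopvar_fn, and simd_reduce are placeholder names, not the actual runtime entry points):

    // 1. LoopProlog: compute the trip count and initialize any
    //    reduction/privatized variables.
    int64_t trip_count = distance_fn(lb, ub, step);   // placeholder
    double red_priv = 0.0;                            // reduction init

    // 2. LoopBody: outlined so it can be passed to the runtime as a
    //    "workload". The extra code converts the runtime-provided
    //    "iteration variable" into the user's "loop variable".
    auto workload = [&](int64_t iter) {
      int64_t i = loopvar_fn(iter);   // iteration var -> loop var
      red_priv += a[i];               // original loop body
    };

    // 3. LoopEpilog: finalize reductions.
    double red = simd_reduce(red_priv);               // placeholder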

There is also an additional step in the code generation that is currently unused, since it is not needed without the full runtime implementation. This is where the loop is generated for 'generic' mode, i.e. when the containing 'parallel' region has code that the 'simd' threads should not hit (see the example below). The relationship between 'parallel' and 'simd' in generic mode is very similar to the relationship between 'teams' and 'parallel' in generic mode, though they are independent (i.e. you can have 'teams-parallel' executed in SPMD mode and 'parallel-simd' executed in generic mode).
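
A hypothetical example of the generic-mode case (again, not from this patch):

    // The per-thread setup must run on the 'parallel' threads only, so
    // the extra 'simd' lanes cannot simply execute the whole region;
    // this mirrors the 'teams'/'parallel' relationship in generic mode.
    #pragma omp target parallel
    {
      per_thread_setup();            // 'simd' lanes must not hit this
      #pragma omp simd
      for (int i = 0; i < n; ++i)
        work(i);
    }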

From 61146e27d804e0fff3711c3defb7064384ccc332 Mon Sep 17 00:00:00 2001
From: Eric Francis Wright <wright117 at rzansel61.coral.llnl.gov>
Date: Mon, 6 May 2024 12:20:44 -0700
Subject: [PATCH] OpenMP offload 'simd' directive

---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         |    2 +
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |  239 +-
 clang/lib/CodeGen/CGStmtOpenMP.cpp            |  187 +-
 clang/lib/CodeGen/CodeGenFunction.cpp         |    2 +-
 clang/lib/Parse/ParseOpenMP.cpp               |    6 +-
 .../target_teams_generic_loop_codegen.cpp     |   12 +-
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  639 ++++-
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |   12 +
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 2121 ++++++++++++++++-
 llvm/lib/Transforms/Utils/CodeExtractor.cpp   |   13 +-
 offload/DeviceRTL/include/Interface.h         |   11 +
 offload/DeviceRTL/include/Mapping.h           |    7 +
 offload/DeviceRTL/src/Kernel.cpp              |    4 +-
 offload/DeviceRTL/src/Mapping.cpp             |   34 +
 offload/DeviceRTL/src/Parallelism.cpp         |   22 +-
 offload/DeviceRTL/src/Reduction.cpp           |   48 +
 offload/DeviceRTL/src/State.cpp               |    7 +-
 offload/DeviceRTL/src/Synchronization.cpp     |    4 +
 offload/DeviceRTL/src/Workshare.cpp           |   44 +
 19 files changed, 3133 insertions(+), 281 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index e39c7c58d2780e..b515f7761ebc71 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1033,6 +1033,7 @@ static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
 
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
     : CGM(CGM), OMPBuilder(CGM.getModule()) {
+
   KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8);
   llvm::OpenMPIRBuilderConfig Config(
       CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
@@ -1054,6 +1055,7 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
 }
 
 void CGOpenMPRuntime::clear() {
+
   InternalVars.clear();
   // Clean non-target variable declarations possibly used only in debug info.
   for (const auto &Data : EmittedNonTargetVariables) {
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 87496c8e488c67..bef637907b0ed4 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -262,6 +262,7 @@ class CheckVarsEscapingDeclContext final
                                bool IsCombinedParallelRegion) {
     if (!S)
       return;
+
     for (const CapturedStmt::Capture &C : S->captures()) {
       if (C.capturesVariable() && !C.capturesVariableByCopy()) {
         const ValueDecl *VD = C.getCapturedVar();
@@ -336,13 +337,15 @@ class CheckVarsEscapingDeclContext final
       return;
     if (!D->hasAssociatedStmt())
       return;
+
     if (const auto *S =
             dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
       // Do not analyze directives that do not actually require capturing,
       // like `omp for` or `omp simd` directives.
       llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
       getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
-      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
+      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown &&
+          D->getDirectiveKind() != OMPD_simd) {
         VisitStmt(S->getCapturedStmt());
         return;
       }
@@ -502,23 +505,23 @@ class CheckVarsEscapingDeclContext final
 } // anonymous namespace
 
 /// Get the id of the warp in the block.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
+///// We assume that the warp size is 32, which is always the case
+///// on the NVPTX device, to generate more efficient code.
 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
   CGBuilderTy &Bld = CGF.Builder;
   unsigned LaneIDBits =
-      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
-  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
+    llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
+   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+   return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
 }
-
+               
 /// Get the id of the current lane in the Warp.
 /// We assume that the warp size is 32, which is always the case
 /// on the NVPTX device, to generate more efficient code.
 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
   CGBuilderTy &Bld = CGF.Builder;
   unsigned LaneIDBits =
-      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
+    llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
   assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
   unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
@@ -2812,9 +2815,11 @@ void CGOpenMPRuntimeGPU::emitReduction(
     return;
 
   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
-#ifndef NDEBUG
+  bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
-#endif
+  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
+
+  ASTContext &C = CGM.getContext();
 
   if (Options.SimpleReduction) {
     assert(!TeamsReduction && !ParallelReduction &&
@@ -2824,155 +2829,78 @@ void CGOpenMPRuntimeGPU::emitReduction(
     return;
   }
 
-  assert((TeamsReduction || ParallelReduction) &&
-         "Invalid reduction selection in emitReduction.");
-
-  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
-  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
-  int Cnt = 0;
-  for (const Expr *DRE : Privates) {
-    PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
-    ++Cnt;
-  }
-
-  ASTContext &C = CGM.getContext();
-  const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
-      CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
-
-  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
-  // RedList, shuffle_reduce_func, interwarp_copy_func);
-  // or
-  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
+  // Source location for the ident struct
   llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
 
-  llvm::Value *Res;
-  // 1. Build a list of reduction variables.
-  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
-  auto Size = RHSExprs.size();
-  for (const Expr *E : Privates) {
-    if (E->getType()->isVariablyModifiedType())
-      // Reserve place for array size.
-      ++Size;
-  }
-  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
-  QualType ReductionArrayTy = C.getConstantArrayType(
-      C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,
-      /*IndexTypeQuals=*/0);
-  Address ReductionList =
-      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
-  auto IPriv = Privates.begin();
-  unsigned Idx = 0;
-  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
-    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
-    CGF.Builder.CreateStore(
-        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-            CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
-        Elem);
-    if ((*IPriv)->getType()->isVariablyModifiedType()) {
-      // Store array size.
-      ++Idx;
-      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
-      llvm::Value *Size = CGF.Builder.CreateIntCast(
-          CGF.getVLASize(
-                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
-              .NumElts,
-          CGF.SizeTy, /*isSigned=*/false);
-      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
-                              Elem);
-    }
-  }
-
-  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-      ReductionList.emitRawPointer(CGF), CGF.VoidPtrTy);
-  llvm::Function *ReductionFn = emitReductionFunction(
-      CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
-      Privates, LHSExprs, RHSExprs, ReductionOps);
-  llvm::Value *ReductionDataSize =
-      CGF.getTypeSize(C.getRecordType(ReductionRec));
-  ReductionDataSize =
-      CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
-  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
-      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
-  llvm::Value *InterWarpCopyFn =
-      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
-
-  if (ParallelReduction) {
-    llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
-                           InterWarpCopyFn};
-
-    Res = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
-        Args);
-  } else {
-    assert(TeamsReduction && "expected teams reduction.");
-    TeamsReductions.push_back(ReductionRec);
-    auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
-        {}, "_openmp_teams_reductions_buffer_$_$ptr");
-    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
-    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
-        ReductionFn);
-    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
-    llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
-        ReductionFn);
-
-    llvm::Value *Args[] = {
-        RTLoc,
-        KernelTeamsReductionPtr,
-        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
-        ReductionDataSize,
-        RL,
-        ShuffleAndReduceFn,
-        InterWarpCopyFn,
-        GlobalToBufferCpyFn,
-        GlobalToBufferRedFn,
-        BufferToGlobalCpyFn,
-        BufferToGlobalRedFn};
-
-    Res = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
-        Args);
-  }
-
-  // 5. Build if (res == 1)
-  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
-  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
-  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
-      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
-  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
-
-  // 6. Build then branch: where we have reduced values in the master
-  //    thread in each team.
-  //    __kmpc_end_reduce{_nowait}(<gtid>);
-  //    break;
-  CGF.EmitBlock(ThenBB);
+  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+  InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
+                         CGF.AllocaInsertPt->getIterator());
+  InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
+                          CGF.Builder.GetInsertPoint());
+  llvm::OpenMPIRBuilder::LocationDescription OmpLoc(CodeGenIP);
+  llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
 
-  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
-  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
-                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
-    auto IPriv = Privates.begin();
-    auto ILHS = LHSExprs.begin();
-    auto IRHS = RHSExprs.begin();
-    for (const Expr *E : ReductionOps) {
-      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
-                                  cast<DeclRefExpr>(*IRHS));
-      ++IPriv;
-      ++ILHS;
-      ++IRHS;
+  CodeGenFunction::OMPPrivateScope Scope(CGF);
+  unsigned Idx = 0;
+  for (const Expr *Private : Privates) {
+    llvm::Type *ElementType;
+    llvm::Value *Variable;
+    llvm::Value *PrivateVariable;
+    llvm::OpenMPIRBuilder::AtomicReductionGenCB AtomicReductionGen = nullptr;
+    ElementType = CGF.ConvertTypeForMem(Private->getType());
+    const auto *RHSVar =
+        cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
+    PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).getBasePointer();
+    const auto *LHSVar =
+        cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
+    Variable = CGF.GetAddrOfLocalVar(LHSVar).getBasePointer();
+    llvm::OpenMPIRBuilder::EvaluationKindTy EvalKind;
+    switch (CGF.getEvaluationKind(Private->getType())) {
+    case TEK_Scalar:
+      EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar;
+      break;
+    case TEK_Complex:
+      EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Complex;
+      break;
+    case TEK_Aggregate:
+      EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Aggregate;
+      break;
     }
-  };
-  RegionCodeGenTy RCG(CodeGen);
-  RCG(CGF);
-  // There is no need to emit line number for unconditional branch.
-  (void)ApplyDebugLocation::CreateEmpty(CGF);
-  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
+    auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
+                            llvm::Value **LHSPtr, llvm::Value **RHSPtr,
+                            llvm::Function *NewFunc) {
+      CGF.Builder.restoreIP(CodeGenIP);
+      auto *CurFn = CGF.CurFn;
+      CGF.CurFn = NewFunc;
+
+      *LHSPtr = CGF.GetAddrOfLocalVar(
+                       cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
+                    .getBasePointer();
+      *RHSPtr = CGF.GetAddrOfLocalVar(
+                       cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
+                    .getBasePointer();
+
+      emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
+                                  cast<DeclRefExpr>(LHSExprs[I]),
+                                  cast<DeclRefExpr>(RHSExprs[I]));
+
+      CGF.CurFn = CurFn;
+
+      return InsertPointTy(CGF.Builder.GetInsertBlock(),
+                           CGF.Builder.GetInsertPoint());
+    };
+    ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
+        ElementType, Variable, PrivateVariable, EvalKind,
+        /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
+    Idx++;
+  }
+
+  CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
+      OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
+      DistributeReduction, SimdReduction, llvm::OpenMPIRBuilder::ReductionGenCBTy::Clang,
+      CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
+      RTLoc));
+  return;
 }
 
 const VarDecl *
@@ -3574,3 +3502,4 @@ llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
                                  CGM.getModule(), OMPRTL___kmpc_get_warp_size),
                              Args);
 }
+
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index ef3aa3a8e0dc61..369e91d33de208 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1402,6 +1402,7 @@ void CodeGenFunction::EmitOMPReductionClauseInit(
     }
 
     const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(TaskRedRef)->getDecl());
+llvm::dbgs() << "Emitting " << VD->getName() << " " << VD << "\n";
     EmitVarDecl(*VD);
     EmitStoreOfScalar(ReductionDesc, GetAddrOfLocalVar(VD),
                       /*Volatile=*/false, TaskRedRef->getType());
@@ -1442,7 +1443,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
     bool WithNowait = D.getSingleClause<OMPNowaitClause>() ||
                       isOpenMPParallelDirective(D.getDirectiveKind()) ||
                       TeamsLoopCanBeParallel || ReductionKind == OMPD_simd;
-    bool SimpleReduction = ReductionKind == OMPD_simd;
+    bool SimpleReduction = (CGM.getLangOpts().OpenMPIsTargetDevice ? false : ReductionKind == OMPD_simd);
     // Emit nowait reduction if nowait clause is present or directive is a
     // parallel directive (it always has implicit barrier).
     CGM.getOpenMPRuntime().emitReduction(
@@ -2673,61 +2674,139 @@ GetAlignedMapping(const OMPSimdDirective &S, CodeGenFunction &CGF) {
 }
 
 void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
-  bool UseOMPIRBuilder =
-      CGM.getLangOpts().OpenMPIRBuilder && isSupportedByOpenMPIRBuilder(S);
-  if (UseOMPIRBuilder) {
-    auto &&CodeGenIRBuilder = [this, &S, UseOMPIRBuilder](CodeGenFunction &CGF,
-                                                          PrePostActionTy &) {
-      // Use the OpenMPIRBuilder if enabled.
-      if (UseOMPIRBuilder) {
-        llvm::MapVector<llvm::Value *, llvm::Value *> AlignedVars =
-            GetAlignedMapping(S, CGF);
-        // Emit the associated statement and get its loop representation.
-        const Stmt *Inner = S.getRawStmt();
-        llvm::CanonicalLoopInfo *CLI =
-            EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
-
-        llvm::OpenMPIRBuilder &OMPBuilder =
-            CGM.getOpenMPRuntime().getOMPBuilder();
-        // Add SIMD specific metadata
-        llvm::ConstantInt *Simdlen = nullptr;
-        if (const auto *C = S.getSingleClause<OMPSimdlenClause>()) {
-          RValue Len =
-              this->EmitAnyExpr(C->getSimdlen(), AggValueSlot::ignored(),
-                                /*ignoreResult=*/true);
-          auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
-          Simdlen = Val;
-        }
-        llvm::ConstantInt *Safelen = nullptr;
-        if (const auto *C = S.getSingleClause<OMPSafelenClause>()) {
-          RValue Len =
-              this->EmitAnyExpr(C->getSafelen(), AggValueSlot::ignored(),
-                                /*ignoreResult=*/true);
-          auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
-          Safelen = Val;
-        }
-        llvm::omp::OrderKind Order = llvm::omp::OrderKind::OMP_ORDER_unknown;
-        if (const auto *C = S.getSingleClause<OMPOrderClause>()) {
-          if (C->getKind() == OpenMPOrderClauseKind ::OMPC_ORDER_concurrent) {
-            Order = llvm::omp::OrderKind::OMP_ORDER_concurrent;
-          }
-        }
-        // Add simd metadata to the collapsed loop. Do not generate
-        // another loop for if clause. Support for if clause is done earlier.
-        OMPBuilder.applySimd(CLI, AlignedVars,
-                             /*IfCond*/ nullptr, Order, Simdlen, Safelen);
-        return;
-      }
+  bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIsTargetDevice;
+  if(UseOMPIRBuilder) {
+    auto *CS = dyn_cast<CapturedStmt>(S.getAssociatedStmt());
+    auto *CL = dyn_cast<OMPCanonicalLoop>(CS->getCapturedStmt());
+    CGCapturedStmtInfo CGSI(*CS, CR_OpenMP);
+
+    CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(*this, &CGSI);
+    llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
+      AllocaInsertPt->getParent(), AllocaInsertPt->getIterator());
+
+    llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder();
+
+    using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+
+    // Callback function for generating the trip count of the loop.
+    // This function should assign values to the TripCount and Signed variables
+    llvm::Value *LoopVar;
+    std::string LoopVarName;
+    EmittedClosureTy LoopVarClosure;
+
+    auto DistanceCB = [&](llvm::BasicBlock *AllocaBB,
+                          InsertPointTy CodeGenIP) -> llvm::Value* {
+      InsertPointTy AllocaIP(AllocaBB, AllocaBB->getTerminator()->getIterator());
+      OMPBuilderCBHelpers::OutlinedRegionBodyRAII IRB(
+        *this, AllocaIP, *(CodeGenIP.getBlock()));
+      Builder.restoreIP(CodeGenIP);
+
+      // Emit the loop variable, needed for the distance func
+      const auto *For = dyn_cast<ForStmt>(CL->getLoopStmt());
+      if(const Stmt *InitStmt = For->getInit())
+        EmitStmt(InitStmt);
+
+      auto *LoopVarRef = CL->getLoopVarRef();
+      LValue LCVal = EmitLValue(LoopVarRef);
+      //Address LoopVarAddress = LCVal.getAddress(*this);
+      //LoopVar = dyn_cast<llvm::Instruction>(LoopVarAddress.getPointer());
+      LoopVar = dyn_cast<llvm::Instruction>(LCVal.getPointer(*this));
+      LoopVarName = LoopVarRef->getNameInfo().getAsString();
+
+      // Emit the distance func from the CanonicalLoop
+      const CapturedStmt *DistanceFunc = CL->getDistanceFunc();
+      EmittedClosureTy DistanceClosure = emitCapturedStmtFunc(*this, DistanceFunc);
+
+      // Load the output and store it in the TripCount
+      QualType LogicalTy = DistanceFunc->getCapturedDecl()
+                           ->getParam(0)
+                           ->getType()
+                           .getNonReferenceType();
+
+      //Address CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
+      RawAddress CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
+ 
+      emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()});
+      auto *TripCount = Builder.CreateLoad(CountAddr, ".count");
+
+      const CapturedStmt *LoopVarFunc = CL->getLoopVarFunc();
+      LoopVarClosure = emitCapturedStmtFunc(*this, LoopVarFunc);
+
+      return TripCount;
     };
-    {
-      auto LPCRegion =
-          CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
-      OMPLexicalScope Scope(*this, S, OMPD_unknown);
-      CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd,
-                                                  CodeGenIRBuilder);
-    }
+
+    auto FiniCB = [this](InsertPointTy IP) {
+      OMPBuilderCBHelpers::FinalizeOMPRegion(*this, IP);
+    };
+
+    auto PrivCB = [](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                     llvm::Value &, llvm::Value &Val, llvm::Value *&ReplVal) {
+      ReplVal = &Val;
+      return CodeGenIP;
+    };
+
+    auto BodyGenCB = [&]
+                     (//InsertPointTy OuterAllocaIP,
+                      llvm::BasicBlock *OuterAllocaBB,
+                      InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                      InsertPointTy Prolog, InsertPointTy ReductionEpilog,
+                      llvm::Value *Virtual) {
+
+      llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
+      InsertPointTy OuterAllocaIP(OuterAllocaBB, OuterAllocaBB->getTerminator()->getIterator());
+
+      OMPBuilderCBHelpers::OutlinedRegionBodyRAII IRB(
+        *this, OuterAllocaIP, *(Prolog.getBlock()));
+      Builder.restoreIP(Prolog);
+
+      OMPPrivateScope PrivateScope(*this);
+      EmitOMPFirstprivateClause(S, PrivateScope);
+      EmitOMPPrivateClause(S, PrivateScope);
+      EmitOMPReductionClauseInit(S, PrivateScope);
+      PrivateScope.Privatize();
+
+      const CapturedStmt *LoopVarFunc = CL->getLoopVarFunc();
+
+      Builder.restoreIP(CodeGenIP);
+      emitCapturedStmtCall(*this, LoopVarClosure,
+                           {LoopVar, Virtual});
+
+      // Generate the body of the loop
+      OMPBuilderCBHelpers::EmitOMPOutlinedRegionBody(
+          *this,
+          S.getBody(),
+          AllocaIP,
+          CodeGenIP,
+          "simd");
+
+       llvm::BasicBlock *RedEpilogBB = ReductionEpilog.getBlock();
+       llvm::Instruction *RedEpilogTerminator = RedEpilogBB->getTerminator();
+       llvm::BasicBlock *FinalBlock = RedEpilogBB->getSingleSuccessor();
+
+       Builder.restoreIP(ReductionEpilog);
+       EmitOMPReductionClauseFinal(S, OMPD_simd);
+
+       llvm::BasicBlock *ReductionThenBB = Builder.GetInsertBlock();
+
+       if(!(ReductionThenBB->getTerminator())) {
+         RedEpilogTerminator->eraseFromParent();
+         Builder.CreateBr(FinalBlock);
+       }
+
+    };
+
+    Builder.restoreIP(
+      OMPBuilder.createSimdLoop(
+        Builder,
+        AllocaIP,
+        BodyGenCB,
+        DistanceCB,
+        PrivCB,
+        FiniCB
+    ));
+
     return;
-  }
+  } 
 
   ParentLoopDirectiveForScanRegion ScanRegion(*this, S);
   OMPFirstScanLoop = true;
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 87766a758311d5..851a93af8bb83c 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -102,7 +102,7 @@ CodeGenFunction::~CodeGenFunction() {
   // seems to be a reasonable spot. We do it here, as opposed to the deletion
   // time of the CodeGenModule, because we have to ensure the IR has not yet
   // been "emitted" to the outside, thus, modifications are still sensible.
-  if (CGM.getLangOpts().OpenMPIRBuilder && CurFn)
+  if ((CGM.getLangOpts().OpenMPIsTargetDevice || CGM.getLangOpts().OpenMPIRBuilder) && CurFn)
     CGM.getOpenMPRuntime().getOMPBuilder().finalize(CurFn);
 }
 
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 53d89ce2fa3e99..9bd78fa9351c88 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2974,10 +2974,8 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective(
         Sema::CompoundScopeRAII Scope(Actions);
         AssociatedStmt = ParseStatement();
 
-        if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
-            getLangOpts().OpenMPIRBuilder)
-          AssociatedStmt =
-              Actions.OpenMP().ActOnOpenMPLoopnest(AssociatedStmt.get());
+        if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind))
+          AssociatedStmt = Actions.OpenMP().ActOnOpenMPLoopnest(AssociatedStmt.get());
       }
       AssociatedStmt =
           Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses);
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
index 3f752ac663f412..4194bdec549dd4 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
@@ -85,6 +85,7 @@ int foo() {
 // IR-GPU-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
 // IR-GPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
 // IR-GPU-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
@@ -102,7 +103,6 @@ int foo() {
 // IR-GPU-NEXT:    [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
 // IR-GPU-NEXT:    [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
 // IR-GPU-NEXT:    [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
-// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
 // IR-GPU-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
@@ -258,6 +258,7 @@ int foo() {
 // IR-GPU-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[J5:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
@@ -275,7 +276,6 @@ int foo() {
 // IR-GPU-NEXT:    [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
 // IR-GPU-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
 // IR-GPU-NEXT:    [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
-// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
 // IR-GPU-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
@@ -399,12 +399,12 @@ int foo() {
 // IR-GPU-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
-// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
 // IR-GPU-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
@@ -480,10 +480,10 @@ int foo() {
 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
 // IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
-// IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
 // IR-GPU-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
@@ -544,12 +544,12 @@ int foo() {
 // IR-GPU-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
-// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
 // IR-GPU-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
@@ -625,10 +625,10 @@ int foo() {
 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
 // IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
-// IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
 // IR-GPU-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index c9ee0c25194c23..870e0edad19c87 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -16,6 +16,7 @@
 
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/Allocator.h"
@@ -99,7 +100,10 @@ class OpenMPIRBuilderConfig {
   /// expanded.
   std::optional<bool> IsGPU;
 
-  // Flag for specifying if offloading is mandatory.
+  /// Flag for specifying if LLVMUsed information should be emitted.
+  std::optional<bool> EmitLLVMUsed;
+
+  /// Flag for specifying if offloading is mandatory.
   std::optional<bool> OpenMPOffloadMandatory;
 
   /// First separator used between the initial two parts of a name.
@@ -107,6 +111,9 @@ class OpenMPIRBuilderConfig {
   /// Separator used between all of the rest consecutive parts of s name
   std::optional<StringRef> Separator;
 
+  // Grid Value for the GPU target
+  std::optional<omp::GV> GridValue;
+
   OpenMPIRBuilderConfig();
   OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
                         bool OpenMPOffloadMandatory,
@@ -132,6 +139,11 @@ class OpenMPIRBuilderConfig {
     return *OpenMPOffloadMandatory;
   }
 
+  omp::GV getGridValue() const {
+    assert(GridValue.has_value() && "GridValue is not set");
+    return *GridValue;
+  }
+
   bool hasRequiresFlags() const { return RequiresFlags; }
   bool hasRequiresReverseOffload() const;
   bool hasRequiresUnifiedAddress() const;
@@ -164,9 +176,11 @@ class OpenMPIRBuilderConfig {
 
   void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
   void setIsGPU(bool Value) { IsGPU = Value; }
+  void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsed = Value; }
   void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
   void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
   void setSeparator(StringRef S) { Separator = S; }
+  void setGridValue(omp::GV G) { GridValue = G; }
 
   void setHasRequiresReverseOffload(bool Value);
   void setHasRequiresUnifiedAddress(bool Value);
@@ -262,7 +276,7 @@ class OffloadEntriesInfoManager {
   /// Return true if a there are no entries defined.
   bool empty() const;
   /// Return number of entries defined so far.
-  unsigned size() const { return OffloadingEntriesNum; }
+  unsigned size() const { return OffloadingEntriesNum /*OffloadEntriesTargetRegion.size()*/ /*OffloadingEntriesNum*/; }
 
   OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}
 
@@ -470,6 +484,11 @@ class OpenMPIRBuilder {
   ///                              all functions are finalized.
   void finalize(Function *Fn = nullptr);
 
+  CallInst *globalizeAlloca(AllocaInst *Alloca, SmallVector<Instruction*, 32>&);
+  void globalizeParallelVars(Function *CurFn);
+  SmallPtrSet<Value*, 32> VarsNeedingGlobalization;
+  void globalizeVars(Function *CurFn);
+
   /// Add attributes known for \p FnID to \p Fn.
   void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
 
@@ -548,6 +567,18 @@ class OpenMPIRBuilder {
   using BodyGenCallbackTy =
       function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
 
+  using LoopBodyCallbackTy =
+      function_ref<void(
+        BasicBlock *OuterAllocaBB, InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+        InsertPointTy PrologIP, InsertPointTy ReductionEpilogIP,
+        Value *IterationNum
+      )>;
+
+  using TripCountCallbackTy =
+      function_ref<
+        Value*(llvm::BasicBlock *AllocaBB, InsertPointTy CodeGenIP)
+      >;
+
   // This is created primarily for sections construct as llvm::function_ref
   // (BodyGenCallbackTy) is not storable (as described in the comments of
   // function_ref class - function_ref contains non-ownable reference
@@ -607,15 +638,17 @@ class OpenMPIRBuilder {
   /// Generator for '#omp barrier'
   ///
   /// \param Loc The location where the barrier directive was encountered.
-  /// \param DK The kind of directive that caused the barrier.
+  /// \param Kind The kind of directive that caused the barrier.
   /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
   /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
   ///                        should be checked and acted upon.
+  /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
   ///
   /// \returns The insertion point after the barrier.
   InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive DK,
                               bool ForceSimpleCall = false,
-                              bool CheckCancelFlag = true);
+                              bool CheckCancelFlag = true,
+                              Value *ThreadID = nullptr);
 
   /// Generator for '#omp cancel'
   ///
@@ -627,6 +660,13 @@ class OpenMPIRBuilder {
   InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition,
                              omp::Directive CanceledDirective);
 
+  IRBuilder<>::InsertPoint
+  createSimdLoop(const LocationDescription &Loc, InsertPointTy AllocaIP,
+                 LoopBodyCallbackTy BodyGenCB,
+                 TripCountCallbackTy DistanceCB,
+                 PrivatizeCallbackTy PrivCB,
+                 FinalizeCallbackTy FiniCB);
+
   /// Generator for '#omp parallel'
   ///
   /// \param Loc The insert and source location description.
@@ -1235,27 +1275,56 @@ class OpenMPIRBuilder {
   getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
                            StringRef ParentName = "");
 
-  /// Functions used to generate reductions. Such functions take two Values
-  /// representing LHS and RHS of the reduction, respectively, and a reference
-  /// to the value that is updated to refer to the reduction result.
-  using ReductionGenTy =
-      function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;
+  /// Enum class for the ReductionGen CallBack type to be used.
+  enum class ReductionGenCBTy { Clang, MLIR };
+
+  /// ReductionGen CallBack for Clang
+  ///
+  /// \param CodeGenIP InsertPoint for CodeGen.
+  /// \param Index Index of the ReductionInfo to generate code for.
+  /// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for
+  /// codegen, used for fixup later.
+  /// \param RHSPtr Optionally used by Clang to
+  /// return the RHSPtr it used for codegen, used for fixup later.
+  /// \param CurFn Optionally used by Clang to pass in the Current Function as
+  /// Clang context may be old.
+  using ReductionGenCBClang =
+      std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
+                                  Value **LHS, Value **RHS, Function *CurFn)>;
+
+  /// ReductionGen CallBack for MLIR
+  ///
+  /// \param CodeGenIP InsertPoint for CodeGen.
+  /// \param LHS Pass in the LHS Value to be used for CodeGen.
+  /// \param RHS Pass in the RHS Value to be used for CodeGen.
+  using ReductionGenCB = std::function<InsertPointTy(
+      InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
 
   /// Functions used to generate atomic reductions. Such functions take two
   /// Values representing pointers to LHS and RHS of the reduction, as well as
   /// the element type of these pointers. They are expected to atomically
   /// update the LHS to the reduced value.
-  using AtomicReductionGenTy =
-      function_ref<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+  using AtomicReductionGenCB =
+      std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+
+  /// Enum class for reduction evaluation types: scalar, complex, and aggregate.
+  enum class EvaluationKindTy { Scalar, Complex, Aggregate };
 
   /// Information about an OpenMP reduction.
   struct ReductionInfo {
     ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
-                  ReductionGenTy ReductionGen,
-                  AtomicReductionGenTy AtomicReductionGen)
+                  EvaluationKindTy EvaluationKind, ReductionGenCB ReductionGen,
+                  ReductionGenCBClang ReductionGenClang,
+                  AtomicReductionGenCB AtomicReductionGen)
         : ElementType(ElementType), Variable(Variable),
-          PrivateVariable(PrivateVariable), ReductionGen(ReductionGen),
+          PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
+          ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
           AtomicReductionGen(AtomicReductionGen) {}
+    ReductionInfo(Value *PrivateVariable)
+        : ElementType(nullptr), Variable(nullptr),
+          PrivateVariable(PrivateVariable),
+          EvaluationKind(EvaluationKindTy::Scalar), ReductionGen(),
+          ReductionGenClang(), AtomicReductionGen() {}
 
     /// Reduction element type, must match pointee type of variable.
     Type *ElementType;
@@ -1266,18 +1335,543 @@ class OpenMPIRBuilder {
     /// Thread-private partial reduction variable.
     Value *PrivateVariable;
 
+    /// Reduction evaluation type - scalar, complex or aggregate.
+    EvaluationKindTy EvaluationKind;
+
     /// Callback for generating the reduction body. The IR produced by this will
     /// be used to combine two values in a thread-safe context, e.g., under
     /// lock or within the same thread, and therefore need not be atomic.
-    ReductionGenTy ReductionGen;
+    ReductionGenCB ReductionGen;
+
+    /// Clang callback for generating the reduction body. The IR produced by
+    /// this will be used to combine two values in a thread-safe context, e.g.,
+    /// under lock or within the same thread, and therefore need not be atomic.
+    ReductionGenCBClang ReductionGenClang;
 
     /// Callback for generating the atomic reduction body, may be null. The IR
     /// produced by this will be used to atomically combine two values during
     /// reduction. If null, the implementation will use the non-atomic version
     /// along with the appropriate synchronization mechanisms.
-    AtomicReductionGenTy AtomicReductionGen;
+    AtomicReductionGenCB AtomicReductionGen;
   };
 
+  enum class CopyAction : unsigned {
+    // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
+    // the warp using shuffle instructions.
+    RemoteLaneToThread,
+    // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
+    ThreadCopy,
+  };
+
+  struct CopyOptionsTy {
+    Value *RemoteLaneOffset = nullptr;
+    Value *ScratchpadIndex = nullptr;
+    Value *ScratchpadWidth = nullptr;
+  };
+
+  /// Supporting functions for Reductions CodeGen.
+private:
+  /// Emit the llvm.used metadata.
+  void emitUsed(StringRef Name, std::vector<llvm::WeakTrackingVH> &List);
+
+  /// Get the id of the current thread on the GPU.
+  Value *getGPUThreadID();
+
+  /// Get the GPU warp size.
+  Value *getGPUWarpSize();
+
+  /// Get the id of the warp in the block.
+  /// We assume that the warp size is 32, which is always the case
+  /// on the NVPTX device, to generate more efficient code.
+  Value *getNVPTXWarpID();
+
+  /// Get the id of the current lane in the Warp.
+  /// We assume that the warp size is 32, which is always the case
+  /// on the NVPTX device, to generate more efficient code.
+  Value *getNVPTXLaneID();
+
+  /// Cast value to the specified type.
+  Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
+
+  /// This function creates calls to one of two shuffle functions to copy
+  /// variables between lanes in a warp.
+  Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
+                                      Type *ElementType, Value *Offset);
+
+  void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
+                       Type *ElementType, Value *Offset,
+                       Type *ReductionArrayTy);
+
+  /// Emit instructions to copy a Reduce list, which contains partially
+  /// aggregated values, in the specified direction.
+  void emitReductionListCopy(
+      InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
+      ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
+      CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
+
+  /// Emit a helper that reduces data across two OpenMP threads (lanes)
+  /// in the same warp.  It uses shuffle instructions to copy over data from
+  /// a remote lane's stack.  The reduction algorithm performed is specified
+  /// by the fourth parameter.
+  ///
+  /// Algorithm Versions.
+  /// Full Warp Reduce (argument value 0):
+  ///   This algorithm assumes that all 32 lanes are active and gathers
+  ///   data from these 32 lanes, producing a single resultant value.
+  /// Contiguous Partial Warp Reduce (argument value 1):
+  ///   This algorithm assumes that only a *contiguous* subset of lanes
+  ///   are active.  This happens for the last warp in a parallel region
+  ///   when the user specified num_threads is not an integer multiple of
+  ///   32.  This contiguous subset always starts with the zeroth lane.
+  /// Partial Warp Reduce (argument value 2):
+  ///   This algorithm gathers data from any number of lanes at any position.
+  /// All reduced values are stored in the lowest possible lane.  The set
+  /// of problems every algorithm addresses is a super set of those
+  /// addressable by algorithms with a lower version number.  Overhead
+  /// increases as algorithm version increases.
+  ///
+  /// Terminology
+  /// Reduce element:
+  ///   Reduce element refers to the individual data field with primitive
+  ///   data types to be combined and reduced across threads.
+  /// Reduce list:
+  ///   Reduce list refers to a collection of local, thread-private
+  ///   reduce elements.
+  /// Remote Reduce list:
+  ///   Remote Reduce list refers to a collection of remote (relative to
+  ///   the current thread) reduce elements.
+  ///
+  /// We distinguish between three states of threads that are important to
+  /// the implementation of this function.
+  /// Alive threads:
+  ///   Threads in a warp executing the SIMT instruction, as distinguished from
+  ///   threads that are inactive due to divergent control flow.
+  /// Active threads:
+  ///   The minimal set of threads that has to be alive upon entry to this
+  ///   function.  The computation is correct iff active threads are alive.
+  ///   Some threads are alive but they are not active because they do not
+  ///   contribute to the computation in any useful manner.  Turning them off
+  ///   may introduce control flow overheads without any tangible benefits.
+  /// Effective threads:
+  ///   In order to comply with the argument requirements of the shuffle
+  ///   function, we must keep all lanes holding data alive.  But at most
+  ///   half of them perform value aggregation; we refer to this half of
+  ///   threads as effective. The other half is simply handing off their
+  ///   data.
+  ///
+  /// Procedure
+  /// Value shuffle:
+  ///   In this step active threads transfer data from higher lane positions
+  ///   in the warp to lower lane positions, creating Remote Reduce list.
+  /// Value aggregation:
+  ///   In this step, effective threads combine their thread local Reduce list
+  ///   with Remote Reduce list and store the result in the thread local
+  ///   Reduce list.
+  /// Value copy:
+  ///   In this step, we deal with the assumption made by algorithm 2
+  ///   (i.e. contiguity assumption).  When we have an odd number of lanes
+  ///   active, say 2k+1, only k threads will be effective and therefore k
+  ///   new values will be produced.  However, the Reduce list owned by the
+  ///   (2k+1)th thread is ignored in the value aggregation.  Therefore
+  ///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
+  ///   that the contiguity assumption still holds.
+  ///
+  /// \param ReductionInfos Array type containing the ReductionOps.
+  /// \param ReduceFn The reduction function.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The ShuffleAndReduce function.
+  Function *emitShuffleAndReduceFunction(
+      ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+      Function *ReduceFn, AttributeList FuncAttrs);
+
+  /// This function emits a helper that gathers Reduce lists from the first
+  /// lane of every active warp to lanes in the first warp.
+  ///
+  /// void inter_warp_copy_func(void* reduce_data, num_warps)
+  ///   shared smem[warp_size];
+  ///   For all data entries D in reduce_data:
+  ///     sync
+  ///     If (I am the first lane in each warp)
+  ///       Copy my local D to smem[warp_id]
+  ///     sync
+  ///     if (I am the first warp)
+  ///       Copy smem[thread_id] to my local D
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param ReductionInfos Array type containing the ReductionOps.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The InterWarpCopy function.
+  Function *emitInterWarpCopyFunction(const LocationDescription &Loc,
+                                      ArrayRef<ReductionInfo> ReductionInfos,
+                                      AttributeList FuncAttrs);
+
+  /// This function emits a helper that copies all the reduction variables from
+  /// the team into the provided global buffer for the reduction variables.
+  ///
+  /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
+  ///   For all data entries D in reduce_data:
+  ///     Copy local D to buffer.D[Idx]
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The ListToGlobalCopy function.
+  Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
+                                         Type *ReductionsBufferTy,
+                                         AttributeList FuncAttrs);
+
+  /// This function emits a helper that copies all the reduction variables from
+  /// the team into the provided global buffer for the reduction variables.
+  ///
+  /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
+  ///   For all data entries D in reduce_data:
+  ///     Copy buffer.D[Idx] to local D;
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The GlobalToList function.
+  Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
+                                         Type *ReductionsBufferTy,
+                                         AttributeList FuncAttrs);
+
+  /// This function emits a helper that reduces all the reduction variables from
+  /// the team into the provided global buffer for the reduction variables.
+  ///
+  /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
+  ///  void *GlobPtrs[];
+  ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
+  ///  ...
+  ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
+  ///  reduce_function(GlobPtrs, reduce_data);
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The ListToGlobalReduce function.
+  Function *
+  emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
+                                 Function *ReduceFn, Type *ReductionsBufferTy,
+                                 AttributeList FuncAttrs);
+
+  /// This function emits a helper that reduces all the reduction variables from
+  /// the team into the provided global buffer for the reduction variables.
+  ///
+  /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
+  ///  void *GlobPtrs[];
+  ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
+  ///  ...
+  ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
+  ///  reduce_function(reduce_data, GlobPtrs);
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The GlobalToListReduce function.
+  Function *
+  emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
+                                 Function *ReduceFn, Type *ReductionsBufferTy,
+                                 AttributeList FuncAttrs);
+
+  /// Get the function name of a reduction function.
+  std::string getReductionFuncName(StringRef Name) const;
+
+  /// Emits reduction function.
+  /// \param ReducerName Name of the function calling the reduction.
+  /// \param ReductionInfos Array type containing the ReductionOps.
+  /// \param ReductionGenCBTy Optional param to specify the codegen callback
+  ///                  kind used (MLIR by default).
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The reduction function.
+  Function *createReductionFunction(
+      StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
+      ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
+      AttributeList FuncAttrs = {});
+
+public:
+  ///
+  /// Design of OpenMP reductions on the GPU
+  ///
+  /// Consider a typical OpenMP program with one or more reduction
+  /// clauses:
+  ///
+  /// float foo;
+  /// double bar;
+  /// #pragma omp target teams distribute parallel for \
+  ///             reduction(+:foo) reduction(*:bar)
+  /// for (int i = 0; i < N; i++) {
+  ///   foo += A[i]; bar *= B[i];
+  /// }
+  ///
+  /// where 'foo' and 'bar' are reduced across all OpenMP threads in
+  /// all teams.  In our OpenMP implementation on the NVPTX device an
+  /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
+  /// within a team are mapped to CUDA threads within a threadblock.
+  /// Our goal is to efficiently aggregate values across all OpenMP
+  /// threads such that:
+  ///
+  ///   - the compiler and runtime are logically concise, and
+  ///   - the reduction is performed efficiently in a hierarchical
+  ///     manner as follows: within OpenMP threads in the same warp,
+  ///     across warps in a threadblock, and finally across teams on
+  ///     the NVPTX device.
+  ///
+  /// Introduction to Decoupling
+  ///
+  /// We would like to decouple the compiler and the runtime so that the
+  /// latter is ignorant of the reduction variables (number, data types)
+  /// and the reduction operators.  This allows a simpler interface
+  /// and implementation while still attaining good performance.
+  ///
+  /// Pseudocode for the aforementioned OpenMP program generated by the
+  /// compiler is as follows:
+  ///
+  /// 1. Create private copies of reduction variables on each OpenMP
+  ///    thread: 'foo_private', 'bar_private'
+  /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
+  ///    to it and writes the result in 'foo_private' and 'bar_private'
+  ///    respectively.
+  /// 3. Call the OpenMP runtime on the GPU to reduce within a team
+  ///    and store the result on the team master:
+  ///
+  ///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
+  ///        reduceData, shuffleReduceFn, interWarpCpyFn)
+  ///
+  ///     where:
+  ///       struct ReduceData {
+  ///         float *foo;
+  ///         double *bar;
+  ///       } reduceData
+  ///       reduceData.foo = &foo_private
+  ///       reduceData.bar = &bar_private
+  ///
+  ///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
+  ///     auxiliary functions generated by the compiler that operate on
+  ///     variables of type 'ReduceData'.  They aid the runtime in performing
+  ///     algorithmic steps in a data-agnostic manner.
+  ///
+  ///     'shuffleReduceFn' is a pointer to a function that reduces data
+  ///     of type 'ReduceData' across two OpenMP threads (lanes) in the
+  ///     same warp.  It takes the following arguments as input:
+  ///
+  ///     a. variable of type 'ReduceData' on the calling lane,
+  ///     b. its lane_id,
+  ///     c. an offset relative to the current lane_id to generate a
+  ///        remote_lane_id.  The remote lane contains the second
+  ///        variable of type 'ReduceData' that is to be reduced.
+  ///     d. an algorithm version parameter determining which reduction
+  ///        algorithm to use.
+  ///
+  ///     'shuffleReduceFn' retrieves data from the remote lane using
+  ///     efficient GPU shuffle intrinsics and reduces, using the
+  ///     algorithm specified by the 4th parameter, the two operands
+  ///     element-wise.  The result is written to the first operand.
+  ///
+  ///     Different reduction algorithms are implemented in different
+  ///     runtime functions, all calling 'shuffleReduceFn' to perform
+  ///     the essential reduction step.  Therefore, based on the 4th
+  ///     parameter, this function behaves slightly differently to
+  ///     cooperate with the runtime to ensure correctness under
+  ///     different circumstances.
+  ///
+  ///     'InterWarpCpyFn' is a pointer to a function that transfers
+  ///     reduced variables across warps.  It tunnels, through CUDA
+  ///     shared memory, the thread-private data of type 'ReduceData'
+  ///     from lane 0 of each warp to a lane in the first warp.
+  /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
+  ///    The last team writes the global reduced value to memory.
+  ///
+  ///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
+  ///             reduceData, shuffleReduceFn, interWarpCpyFn,
+  ///             scratchpadCopyFn, loadAndReduceFn)
+  ///
+  ///     'scratchpadCopyFn' is a helper that stores reduced
+  ///     data from the team master to a scratchpad array in
+  ///     global memory.
+  ///
+  ///     'loadAndReduceFn' is a helper that loads data from
+  ///     the scratchpad array and reduces it with the input
+  ///     operand.
+  ///
+  ///     These compiler generated functions hide address
+  ///     calculation and alignment information from the runtime.
+  /// 5. if ret == 1:
+  ///     The team master of the last team stores the reduced
+  ///     result to the globals in memory.
+  ///     foo += reduceData.foo; bar *= reduceData.bar
+  ///
+  ///
+  /// Warp Reduction Algorithms
+  ///
+  /// On the warp level, we have three algorithms implemented in the
+  /// OpenMP runtime depending on the number of active lanes:
+  ///
+  /// Full Warp Reduction
+  ///
+  /// The reduce algorithm within a warp where all lanes are active
+  /// is implemented in the runtime as follows:
+  ///
+  /// full_warp_reduce(void *reduce_data,
+  ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
+  ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
+  /// }
+  ///
+  /// The algorithm completes in log(2, WARPSIZE) steps.
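+  /// For example, with WARPSIZE = 32 the successive offsets are 16, 8, 4, 2,
+  /// 1, i.e. five steps.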
+  ///
+  /// 'ShuffleReduceFn' is invoked with lane_id set to 0 because the lane id
+  /// is not used by this algorithm; we thereby save instructions by not
+  /// retrieving it from the corresponding special registers.  The 4th
+  /// parameter, which represents the version of the algorithm being used, is
+  /// set to 0 to signify full warp reduction.
+  ///
+  /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+  ///
+  /// #reduce_elem refers to an element in the local lane's data structure
+  /// #remote_elem is retrieved from a remote lane
+  /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+  /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
+  ///
+  /// Contiguous Partial Warp Reduction
+  ///
+  /// This reduce algorithm is used within a warp where only the first
+  /// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
+  /// number of OpenMP threads in a parallel region is not a multiple of
+  /// WARPSIZE.  The algorithm is implemented in the runtime as follows:
+  ///
+  /// void
+  /// contiguous_partial_reduce(void *reduce_data,
+  ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
+  ///                           int size, int lane_id) {
+  ///   int curr_size;
+  ///   int offset;
+  ///   curr_size = size;
+  ///   offset = curr_size/2;
+  ///   while (offset > 0) {
+  ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
+  ///     curr_size = (curr_size+1)/2;
+  ///     offset = curr_size/2;
+  ///   }
+  /// }
+  ///
+  /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+  ///
+  /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+  /// if (lane_id < offset)
+  ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
+  /// else
+  ///     reduce_elem = remote_elem
+  ///
+  /// This algorithm assumes that the data to be reduced are located in a
+  /// contiguous subset of lanes starting from the first.  When there is
+  /// an odd number of active lanes, the data in the last lane is not
+  /// aggregated with any other lane's data but is instead copied over.
+  ///
+  /// Dispersed Partial Warp Reduction
+  ///
+  /// This algorithm is used within a warp when any discontiguous subset of
+  /// lanes are active.  It is used to implement the reduction operation
+  /// across lanes in an OpenMP simd region or in a nested parallel region.
+  ///
+  /// void
+  /// dispersed_partial_reduce(void *reduce_data,
+  ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  ///   int size, remote_id;
+  ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
+  ///   do {
+  ///       remote_id = next_active_lane_id_right_after_me();
+  ///       # the above function returns 0 if no active lane
+  ///       # is present right after the current lane.
+  ///       size = number_of_active_lanes_in_this_warp();
+  ///       logical_lane_id /= 2;
+  ///       ShuffleReduceFn(reduce_data, logical_lane_id,
+  ///                       remote_id-1-threadIdx.x, 2);
+  ///   } while (logical_lane_id % 2 == 0 && size > 1);
+  /// }
+  ///
+  /// There is no assumption made about the initial state of the reduction.
+  /// Any number of lanes (>=1) could be active at any position.  The reduction
+  /// result is returned in the first active lane.
+  ///
+  /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+  ///
+  /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+  /// if (lane_id % 2 == 0 && offset > 0)
+  ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
+  /// else
+  ///     reduce_elem = remote_elem
+  ///
+  ///
+  /// Intra-Team Reduction
+  ///
+  /// This function, as implemented in the runtime call
+  /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
+  /// threads in a team.  It first reduces within a warp using the
+  /// aforementioned algorithms.  We then proceed to gather all such
+  /// reduced values at the first warp.
+  ///
+  /// The runtime makes use of the function 'InterWarpCpyFn', which copies
+  /// data from each "warp master" (the zeroth lane of each warp, where
+  /// warp-reduced data is held) to the zeroth warp.  This step reduces (in
+  /// a mathematical sense) the problem of reduction across warp masters in
+  /// a block to the problem of warp reduction.
+  ///
+  ///
+  /// Inter-Team Reduction
+  ///
+  /// Once a team has reduced its data to a single value, it is stored in
+  /// a global scratchpad array.  Since each team has a distinct slot, this
+  /// can be done without locking.
+  ///
+  /// The last team to write to the scratchpad array proceeds to reduce the
+  /// scratchpad array.  One or more workers in the last team use the helper
+  /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
+  /// the k'th worker reduces every k'th element.
+  ///
+  /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
+  /// reduce across workers and compute a globally reduced value.
+  ///
+  /// \param Loc                The location where the reduction was
+  ///                           encountered. Must be within the associated
+  ///                           directive and after the last local access to
+  ///                           the reduction variables.
+  /// \param AllocaIP           An insertion point suitable for allocas usable
+  ///                           in reductions.
+  /// \param CodeGenIP          An insertion point suitable for code
+  ///                           generation.
+  /// \param ReductionInfos     A list of info on each reduction variable.
+  /// \param IsNoWait           Optional flag set if the reduction is marked
+  ///                           as nowait.
+  /// \param IsTeamsReduction   Optional flag set if it is a teams reduction.
+  /// \param HasDistribute      Optional flag set if it is a distribute
+  ///                           reduction.
+  /// \param IsSimdReduction    Optional flag set if it is a simd reduction.
+  /// \param ReductionGenCBTy   Optional param to specify the codegen callback
+  ///                           kind used.
+  /// \param GridValue          Optional GPU grid value.
+  /// \param ReductionBufNum    Optional OpenMPCUDAReductionBufNumValue to be
+  ///                           used for teams reduction.
+  /// \param SrcLocInfo         Optional source location information.
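+  ///
+  /// A minimal call sketch (assuming `Infos` already holds the ReductionInfo
+  /// entries built by the caller; the insertion points come from the
+  /// enclosing region's code generation):
+  ///
+  /// \code
+  ///   InsertPointTy AfterIP = OMPBuilder.createReductionsGPU(
+  ///       Loc, AllocaIP, CodeGenIP, Infos,
+  ///       /*IsNoWait=*/false, /*IsTeamsReduction=*/true,
+  ///       /*HasDistribute=*/false, /*IsSimdReduction=*/false);
+  ///   Builder.restoreIP(AfterIP);
+  /// \endcode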
+  InsertPointTy createReductionsGPU(
+      const LocationDescription &Loc, InsertPointTy AllocaIP,
+      InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
+      bool IsNoWait = false, bool IsTeamsReduction = false,
+      bool HasDistribute = false, bool IsSimdReduction = false,
+      ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
+      std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
+      Value *SrcLocInfo = nullptr);
+
   // TODO: provide atomic and non-atomic reduction generators for reduction
   // operators defined by the OpenMP specification.
 
@@ -1421,19 +2015,6 @@ class OpenMPIRBuilder {
                                  Value *NumThreads, Value *HostPtr,
                                  ArrayRef<Value *> KernelArgs);
 
-  /// Generate a barrier runtime call.
-  ///
-  /// \param Loc The location at which the request originated and is fulfilled.
-  /// \param DK The directive which caused the barrier
-  /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
-  /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
-  ///                        should be checked and acted upon.
-  ///
-  /// \returns The insertion point after the barrier.
-  InsertPointTy emitBarrierImpl(const LocationDescription &Loc,
-                                omp::Directive DK, bool ForceSimpleCall,
-                                bool CheckCancelFlag);
-
   /// Generate a flush runtime call.
   ///
   /// \param Loc The location at which the request originated and is fulfilled.
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index fe09bb8177c28e..116b7e98cdd873 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -124,6 +124,9 @@ __OMP_FUNCTION_TYPE(ShuffleReduce, false, Void, VoidPtr, Int16, Int16, Int16)
 __OMP_FUNCTION_TYPE(InterWarpCopy, false, Void, VoidPtr, Int32)
 __OMP_FUNCTION_TYPE(GlobalList, false, Void, VoidPtr, Int32, VoidPtr)
 
+__OMP_FUNCTION_TYPE(LoopTask, false, Void, Int64, VoidPtrPtr)
+__OMP_FUNCTION_TYPE(SimdTask, false, Void, VoidPtrPtr)
+
 #undef __OMP_FUNCTION_TYPE
 #undef OMP_FUNCTION_TYPE
 
@@ -204,6 +207,7 @@ __ICV_RT_GET(proc_bind, omp_get_proc_bind)
 
 
 __OMP_RTL(__kmpc_barrier, false, Void, IdentPtr, Int32)
+__OMP_RTL(__kmpc_simd_barrier, false, Void, )
 __OMP_RTL(__kmpc_cancel, false, Int32, IdentPtr, Int32, Int32)
 __OMP_RTL(__kmpc_cancel_barrier, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_error, false, Void, IdentPtr, Int32, Int8Ptr)
@@ -227,6 +231,7 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
 __OMP_RTL(__kmpc_get_warp_size, false, Int32, )
 
 __OMP_RTL(omp_get_thread_num, false, Int32, )
+__OMP_RTL(omp_get_simd_lane, false, Int32, )
 __OMP_RTL(omp_get_num_threads, false, Int32, )
 __OMP_RTL(omp_get_max_threads, false, Int32, )
 __OMP_RTL(omp_in_parallel, false, Int32, )
@@ -480,6 +485,8 @@ __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
 __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_shuffle_int32, false, Int32, Int32, Int16, Int16)
+__OMP_RTL(__kmpc_nvptx_simd_reduce_nowait_v2, false, Int32, IdentPtr,
+	  Int64, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr)
 __OMP_RTL(__kmpc_nvptx_parallel_reduce_nowait_v2, false, Int32, IdentPtr,
 	  Int64, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr)
 __OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr,
@@ -502,6 +509,10 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
 __OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
 
+__OMP_RTL(__kmpc_simd_4u, false, Void, IdentPtr, LoopTaskPtr, Int32, VoidPtrPtr)
+__OMP_RTL(__kmpc_simd_8u, false, Void, IdentPtr, LoopTaskPtr, Int64, VoidPtrPtr)
+__OMP_RTL(__kmpc_simd, false, Void, IdentPtr, SimdTaskPtr, VoidPtrPtr, Int32)
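+
+// The three simd entry points above correspond (approximately) to the
+// following C prototypes in the device runtime; the parameter names here are
+// illustrative only:
+//
+//   void __kmpc_simd_4u(ident_t *Loc, void (*LoopTask)(int64_t, void **),
+//                       uint32_t TripCount, void **Args);
+//   void __kmpc_simd_8u(ident_t *Loc, void (*LoopTask)(int64_t, void **),
+//                       uint64_t TripCount, void **Args);
+//   void __kmpc_simd(ident_t *Loc, void (*SimdTask)(void **), void **Args,
+//                    int32_t NumArgs);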
+
 __OMP_RTL(__last, false, Void, )
 
 #undef __OMP_RTL
@@ -708,6 +719,7 @@ __OMP_RTL_ATTRS(__kmpc_get_hardware_num_threads_in_block, GetterAttrs, ZExt, Par
 __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
 
 __OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_simd_lane, GetterAttrs, SExt, ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
 __OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 4d2d352f7520b2..77f3f863e1e358 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -145,6 +145,8 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
 }
 #endif
 
+Function *GLOBAL_ReductionFunc = nullptr;
+
 static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
   if (T.isAMDGPU()) {
     StringRef Features =
@@ -782,6 +784,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
   for (Function *F : ConstantAllocaRaiseCandidates)
     raiseUserConstantDataAllocasToEntryBlock(Builder, F);
 
+  // TODO: Variable globalization for 'simd' regions (globalizeVars) is not
+  // enabled yet.
+  // globalizeVars(Fn);
+
   EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
       [](EmitMetadataErrorKind Kind,
          const TargetRegionEntryInfo &EntryInfo) -> void {
@@ -790,8 +794,167 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
               "OMPIRBuilder finalization \n";
   };
 
   if (!OffloadInfoManager.empty())
     createOffloadEntriesAndInfoMetadata(ErrorReportFn);
+
+  if (Config.EmitLLVMUsed) {
+    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
+        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
+    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
+  }
+}
+
+CallInst *OpenMPIRBuilder::globalizeAlloca(
+    AllocaInst *Alloca, SmallVector<Instruction *, 32> &ToBeDeleted) {
+  FunctionCallee AllocFn =
+      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc_shared);
+
+  Builder.SetInsertPoint(Alloca);
+  Value *SharedAllocArgs[] = {ConstantInt::get(
+      Int64, Alloca->getAllocationSize(M.getDataLayout())->getFixedValue())};
+
+  CallInst *AllocSharedCall =
+      Builder.CreateCall(AllocFn, ArrayRef<Value *>(SharedAllocArgs, 1));
+  AllocSharedCall->setName(Alloca->getName() + "_on_stack");
+
+  LLVM_DEBUG(dbgs() << "Created " << *AllocSharedCall << "\n");
+
+  // If the alloca was allocated in address space 5 (local), its single user
+  // is an addrspacecast to the generic address space; replace the cast
+  // instead of the alloca to account for the type mismatch with the generic
+  // pointer returned by __kmpc_alloc_shared.
+  if (Alloca->hasOneUser() && isa<AddrSpaceCastInst>(Alloca->user_back())) {
+    auto *AddrSpaceCast = cast<AddrSpaceCastInst>(Alloca->user_back());
+    AddrSpaceCast->replaceAllUsesWith(AllocSharedCall);
+    ToBeDeleted.push_back(AddrSpaceCast);
+  } else {
+    Alloca->replaceAllUsesWith(AllocSharedCall);
+  }
+  ToBeDeleted.push_back(Alloca);
+
+  return AllocSharedCall;
+}
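+
+// As an illustration (IR names hypothetical), globalizeAlloca rewrites
+//
+//   %x = alloca i32, align 4, addrspace(5)
+//   %x.ascast = addrspacecast ptr addrspace(5) %x to ptr
+//
+// into
+//
+//   %x_on_stack = call ptr @__kmpc_alloc_shared(i64 4)
+//
+// and globalizeParallelVars later emits the matching
+// `call void @__kmpc_free_shared(ptr %x_on_stack, i64 4)` at the end of the
+// outlined region.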
+
+void OpenMPIRBuilder::globalizeParallelVars(llvm::Function *CurFn) {
+  SmallVector<Instruction *, 32> ToBeDeleted;
+  std::stack<CallInst *> GlobalizedVars;
+
+  LLVM_DEBUG(dbgs() << "  Exploring: " << CurFn->getName() << "\n");
+
+  // Globalize every alloca in the outlined parallel region.
+  for (BasicBlock &BB : *CurFn)
+    for (Instruction &I : BB)
+      if (auto *Alloca = dyn_cast<AllocaInst>(&I))
+        GlobalizedVars.push(globalizeAlloca(Alloca, ToBeDeleted));
+
+  // Free the globalized variables in reverse allocation order at the end of
+  // the region.
+  BasicBlock &EndBlock = CurFn->back();
+  Builder.SetInsertPoint(EndBlock.begin());
+  while (!GlobalizedVars.empty()) {
+    CallInst *SharedAlloc = GlobalizedVars.top();
+    GlobalizedVars.pop();
+    FunctionCallee FreeFn =
+        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free_shared);
+
+    Value *SharedFreeArgs[] = {SharedAlloc, SharedAlloc->getArgOperand(0)};
+    Builder.CreateCall(FreeFn, ArrayRef<Value *>(SharedFreeArgs, 2));
+  }
+
+  for (Instruction *I : ToBeDeleted)
+    I->eraseFromParent();
+}
+
+// Globalize any variables that are needed in a lower level of
+// the parallel hierarchy, i.e. variables that a 'parallel' region
+// shares with the threads of a nested region.
+// Only variables used in 'simd' regions are supported right now.
+void OpenMPIRBuilder::globalizeVars(llvm::Function *CurFn) {
+  for (BasicBlock &BB : *CurFn) {
+    for (Instruction &I : BB) {
+      auto *FnCall = dyn_cast<CallInst>(&I);
+      if (!FnCall)
+        continue;
+      Function *Fn = FnCall->getCalledFunction();
+      if (!Fn || Fn->getName() != "__kmpc_parallel_51")
+        continue;
+
+      // Argument 5 of __kmpc_parallel_51 is the outlined parallel region.
+      Function *OutlinedFn = dyn_cast<Function>(FnCall->getArgOperand(5));
+      assert(OutlinedFn && "failed to find GPU parallel outlined fn");
+
+      LLVM_DEBUG(dbgs() << "Found a parallel region\n");
+      globalizeParallelVars(OutlinedFn);
+    }
+  }
 }
 
 OpenMPIRBuilder::~OpenMPIRBuilder() {
@@ -924,16 +1087,12 @@ Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
 }
 
 OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK,
-                               bool ForceSimpleCall, bool CheckCancelFlag) {
+OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
+                               bool ForceSimpleCall, bool CheckCancelFlag,
+                               Value *ThreadID) {
   if (!updateToLocation(Loc))
     return Loc.IP;
-  return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag);
-}
 
-OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
-                                 bool ForceSimpleCall, bool CheckCancelFlag) {
   // Build call __kmpc_cancel_barrier(loc, thread_id) or
   //            __kmpc_barrier(loc, thread_id);
 
@@ -958,9 +1117,11 @@ OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
-  Value *Args[] = {
-      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
-      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
+  if (!ThreadID)
+    ThreadID = getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
+
+  Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
+                   ThreadID};
 
   // If we are in a cancellable parallel region, barriers are cancellation
   // points.
@@ -1338,6 +1499,467 @@ hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
   }
 }
 
+IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
+    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
+    LoopBodyCallbackTy BodyGenCB, TripCountCallbackTy DistanceCB,
+    PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB) {
+  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
+
+  if (!updateToLocation(Loc))
+    return Loc.IP;
+
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+
+  BasicBlock *InsertBB = Builder.GetInsertBlock();
+  Function *OuterFn = InsertBB->getParent();
+
+  LLVM_DEBUG(dbgs() << "At the start of createSimdLoop:\n" << *OuterFn << "\n");
+
+  // Save the outer alloca block because the insertion iterator may get
+  // invalidated and we still need this later.
+  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
+
+  // Vector to remember instructions we used only during the modeling but which
+  // we want to delete at the end.
+  SmallVector<Instruction *, 16> ToBeDeleted;
+
+  // Create an artificial insertion point that will also ensure the blocks we
+  // are about to split are not degenerated.
+  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
+
+  Instruction *ThenTI = UI;
+
+  BasicBlock *ThenBB = ThenTI->getParent();
+
+  // Alloca block for simd
+  BasicBlock *EntryBB = ThenBB->splitBasicBlock(ThenTI, "omp.simd.entry");
+
+  // Block for setup related to simd,
+  // i.e. variable privatization, trip count, reductions
+  BasicBlock *PrologBB = EntryBB->splitBasicBlock(ThenTI, "omp.simd.prolog");
+
+  // Entry block for the outlined loop body
+  // Allocas from the loop body should be done here
+  BasicBlock *LoopEntryBB =
+      PrologBB->splitBasicBlock(ThenTI, "omp.simd.loop.entry");
+
+  // Block for generating the loop body
+  BasicBlock *LoopBodyBB =
+      LoopEntryBB->splitBasicBlock(ThenTI, "omp.simd.loop.body");
+
+  BasicBlock *LoopPreFiniBB =
+    LoopBodyBB->splitBasicBlock(ThenTI, "omp.simd.loop.pre_finalize");
+
+  BasicBlock *LoopExitBB =
+    LoopPreFiniBB->splitBasicBlock(ThenTI, "omp.simd.loop.outlined.exit");
+
+  // Block for finalizing any reductions
+  BasicBlock *ReductionEpilogBB =
+    LoopExitBB->splitBasicBlock(ThenTI, "omp.reduction.epilog");
+
+  BasicBlock *FinalizeBB =
+    ReductionEpilogBB->splitBasicBlock(ThenTI, "omp.simd.finalize");
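+
+  // The resulting fall-through block sequence is:
+  //   omp.simd.entry            ; allocas for the simd construct
+  //   omp.simd.prolog           ; trip count, privatization, reduction init
+  //   omp.simd.loop.entry       ; allocas for the outlined loop body
+  //   omp.simd.loop.body        ; user loop body
+  //   omp.simd.loop.pre_finalize
+  //   omp.simd.loop.outlined.exit
+  //   omp.reduction.epilog      ; reduction finalization
+  //   omp.simd.finalize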
+
+  auto FiniCBWrapper = [&](InsertPointTy IP) {
+    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
+    // target to the region exit blocks
+    if (IP.getBlock()->end() == IP.getPoint()) {
+      IRBuilder<>::InsertPointGuard IPG(Builder);
+      Builder.restoreIP(IP);
+      Instruction *I = Builder.CreateBr(FinalizeBB);
+      IP = InsertPointTy(I->getParent(), I->getIterator());
+    }
+    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
+           IP.getBlock()->getTerminator()->getSuccessor(0) == FinalizeBB &&
+           "Unexpected insertion point for finalization call!");
+    return FiniCB(IP);
+  };
+
+  FinalizationStack.push_back({FiniCBWrapper, OMPD_simd, false});
+
+  // Compute the loop trip count. Insert after the outer alloca to ensure
+  // all variables needed in its calculation are ready.
+  InsertPointTy DistanceIP(PrologBB, PrologBB->getTerminator()->getIterator());
+  assert(DistanceCB && "expected loop trip count callback function!");
+  Value *DistVal = DistanceCB(EntryBB, DistanceIP);
+  assert(DistVal && "trip count callback should return an integer trip count");
+  Type *DistValType = DistVal->getType();
+  assert(DistValType->isIntegerTy() && "trip count should be integer type");
+
+  LLVM_DEBUG(dbgs() << "After DistanceCB:\n" << *PrologBB << "\n");
+  LLVM_DEBUG(dbgs() << "Trip count variable: " << *DistVal << "\n");
+
+  // Create the virtual iteration variable that will be pulled into
+  // the outlined function.
+  Builder.SetInsertPoint(EntryBB, EntryBB->begin());
+  AllocaInst *OMPIVAlloca =
+      Builder.CreateAlloca(DistValType, nullptr, "omp.iv.tmp");
+  Instruction *OMPIV = Builder.CreateLoad(DistValType, OMPIVAlloca, "omp.iv");
+
+  // Generate the privatization allocas in the block that will become the entry
+  // of the outlined function.
+  Builder.SetInsertPoint(LoopEntryBB, LoopEntryBB->begin());
+  // Use omp.iv in the outlined region so it gets captured during the outline
+  Instruction *OMPIVUse = dyn_cast<Instruction>(
+    Builder.CreateAdd(OMPIV, OMPIV, "omp.iv.tobedeleted"));
+  InsertPointTy InnerAllocaIP = Builder.saveIP();
+
+  // All of the temporary omp.iv variables need to be deleted later;
+  // the order matters (uses before defs).
+  ToBeDeleted.push_back(OMPIVUse);
+  ToBeDeleted.push_back(OMPIV);
+  ToBeDeleted.push_back(OMPIVAlloca);
+
+  LLVM_DEBUG(dbgs() << "omp.iv variable generated:\n" << *OuterFn << "\n");
+
+  LLVM_DEBUG(dbgs() << "Before body codegen:\n" << *OuterFn << "\n");
+  assert(BodyGenCB && "Expected body generation callback!");
+  InsertPointTy CodeGenIP(LoopBodyBB,
+                          LoopBodyBB->getTerminator()->getIterator());
+
+  InsertPointTy PrologIP(PrologBB, PrologBB->getTerminator()->getIterator());
+  InsertPointTy ReductionEpilogIP(ReductionEpilogBB, ReductionEpilogBB->begin());
+
+  // Generate the body of the loop. The omp.iv variable holds a value in the
+  // range 0 <= omp.iv < TripCount. If a loop variable is needed, the
+  // callback can initialize it based on omp.iv.
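+  // For example, for `for (int i = LB; i < UB; i += Step)` the callback can
+  // materialize the loop variable as `i = LB + omp.iv * Step`.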
+  BodyGenCB(EntryBB, InnerAllocaIP, CodeGenIP, PrologIP, ReductionEpilogIP, OMPIV);
+
+  LLVM_DEBUG(dbgs() << "After body codegen:\n" << *OuterFn << "\n");
+
+
+  // Outline 1
+  {
+    OutlineInfo OI;
+
+    // Call the finalize callback a last time to finalize values between the
+    // pre-fini block and the exit block if we left the loop "the normal way".
+
+    Instruction *LoopPreFiniTI = LoopPreFiniBB->getTerminator();
+
+    InsertPointTy PreFiniIP(LoopPreFiniBB, LoopPreFiniTI->getIterator());
+    FiniCB(PreFiniIP);
+
+    OI.OuterAllocaBB = EntryBB;
+    OI.EntryBB = LoopEntryBB;
+    OI.ExitBB = LoopExitBB;
+
+    SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
+    SmallVector<BasicBlock *, 32> Blocks;
+    OI.collectBlocks(ParallelRegionBlockSet, Blocks);
+
+    // Ensure a single exit node for the outlined region by creating one.
+    // We might have multiple incoming edges to the exit now due to
+    // finalizations, e.g., cancel calls that cause the control flow to
+    // leave the region.
+
+    Blocks.push_back(LoopExitBB);
+
+    CodeExtractorAnalysisCache CEAC(*OuterFn);
+
+    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
+                            /* AggregateArgs */ true,
+                            /* BlockFrequencyInfo */ nullptr,
+                            /* BranchProbabilityInfo */ nullptr,
+                            /* AssumptionCache */ nullptr,
+                            /* AllowVarArgs */ false,
+                            /* AllowAlloca */ true,
+                            /* AllocationBlock */ EntryBB,
+                            /* Suffix */ ".omp_simd");
+
+    BasicBlock *CommonExit = nullptr;
+    SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
+    Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
+    Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
+
+    auto PrivHelper = [&](Value &V) {
+      // Exclude omp.iv from aggregate
+      if (&V == OMPIV) {
+        OI.ExcludeArgsFromAggregate.push_back(&V);
+        return;
+      }
+
+      // Get all uses of value that are inside of the outlined region
+      SetVector<Use *> Uses;
+      for (Use &U : V.uses())
+        if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
+          if (ParallelRegionBlockSet.count(UserI->getParent()))
+            Uses.insert(&U);
+
+      Value *Inner = &V;
+
+      // If the value isn't a pointer type, store it in a pointer
+      // Unpack it inside the outlined region
+      if (!V.getType()->isPointerTy()) {
+        IRBuilder<>::InsertPointGuard Guard(Builder);
+        LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
+
+        Builder.restoreIP(OuterAllocaIP);
+        Value *Ptr = Builder.CreateAlloca(
+          V.getType(), nullptr, V.getName() + ".reloaded");
+
+        // Store to stack at end of the block that currently branches to the entry
+        // block of the to-be-outlined region.
+        Builder.SetInsertPoint(
+          InsertBB, InsertBB->getTerminator()->getIterator());
+        Builder.CreateStore(&V, Ptr);
+
+        // Load back next to allocations in the to-be-outlined region.
+        Builder.restoreIP(InnerAllocaIP);
+        Inner = Builder.CreateLoad(V.getType(), Ptr);
+      }
+
+      Value *ReplacementValue = nullptr;
+      Builder.restoreIP(
+        PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
+      assert(ReplacementValue &&
+        "Expected copy/create callback to set replacement value!");
+      if (ReplacementValue == &V)
+        return;
+
+      for (Use *UPtr : Uses)
+        UPtr->set(ReplacementValue);
+
+    };
+
+    LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
+
+    InnerAllocaIP = IRBuilder<>::InsertPoint(
+        OMPIV->getParent(), OMPIV->getNextNode()->getIterator());
+
+    // Reset the outer alloca insertion point to the entry of the relevant block
+    // in case it was invalidated.
+    OuterAllocaIP = IRBuilder<>::InsertPoint(
+      OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
+
+    for (Value *Input : Inputs) {
+      PrivHelper(*Input);
+    }
+
+    assert(Outputs.empty() &&
+      "OpenMP outlining should not produce live-out values!");
+
+    LLVM_DEBUG(dbgs() << "After  privatization: " << *OuterFn << "\n");
+    for (auto *BB : Blocks) {
+      LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n");
+    }
+
+    OI.PostOutlineCB = [=](Function &OutlinedFn) {
+
+      OutlinedFn.addFnAttr(Attribute::NoUnwind);
+      OutlinedFn.addFnAttr(Attribute::NoRecurse);
+
+      assert(OutlinedFn.arg_size() == 2 &&
+             "Expected omp.iv & structArg as arguments");
+
+      CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
+      BasicBlock *CallBlock = CI->getParent();
+      CallBlock->setName("omp_loop");
+      Builder.SetInsertPoint(CI);
+
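+      // Replace the direct call `outlined(omp.iv, structArg)` produced by
+      // the extractor with a dispatch through the device runtime:
+      //   __kmpc_simd_{4u,8u}(ident, outlined, tripcount, args)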
+      Value *StructArg = CI->getArgOperand(1); // Operand 0 is omp.iv.
+
+      Value *SimdArgs[] = {
+          Ident,
+          Builder.CreateBitCast(&OutlinedFn, LoopTaskPtr),
+          DistVal,
+          Builder.CreateCast(Instruction::BitCast, StructArg, Int8PtrPtr)};
+
+      SmallVector<Value *, 16> RealArgs;
+      RealArgs.append(std::begin(SimdArgs), std::end(SimdArgs));
+
+      FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr(
+        (DistValType->isIntegerTy(32) ? OMPRTL___kmpc_simd_4u :
+                                        OMPRTL___kmpc_simd_8u));
+      Builder.CreateCall(RTLFn, RealArgs);
+
+      LLVM_DEBUG(dbgs() << "With kmpc_simd_4u call placed: " << *Builder.GetInsertBlock()->getParent() << "\n");
+
+      CI->eraseFromParent();
+
+      for (Instruction *I : ToBeDeleted)
+        I->eraseFromParent();
+
+    };
+
+    addOutlineInfo(std::move(OI));
+  }
+
+  // Outline 2: outlines the whole simd construct for 'generic' mode, i.e.
+  // when the containing 'parallel' region has code that the 'simd' threads
+  // should not execute. Currently disabled until the full runtime
+  // implementation is in place.
+  if (false) { // if (!SPMD)
+    OutlineInfo OI;
+
+    OI.OuterAllocaBB = OuterAllocaBlock;
+    OI.EntryBB = EntryBB;
+    OI.ExitBB = FinalizeBB;
+
+    SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
+    SmallVector<BasicBlock *, 32> Blocks;
+    OI.collectBlocks(ParallelRegionBlockSet, Blocks);
+
+    CodeExtractorAnalysisCache CEAC(*OuterFn);
+
+    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
+                            /* AggregateArgs */ true,
+                            /* BlockFrequencyInfo */ nullptr,
+                            /* BranchProbabilityInfo */ nullptr,
+                            /* AssumptionCache */ nullptr,
+                            /* AllowVarArgs */ false,
+                            /* AllowAlloca */ true,
+                            /* AllocationBlock */ OuterAllocaBlock,
+                            /* Suffix */ ".omp_simd");
+
+    BasicBlock *CommonExit = nullptr;
+    SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
+    Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
+    Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
+
+    auto PrivHelper = [&](Value &V) {
+
+      // Get all uses of value that are inside of the outlined region
+      SetVector<Use *> Uses;
+      for (Use &U : V.uses())
+        if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
+          if (ParallelRegionBlockSet.count(UserI->getParent()))
+            Uses.insert(&U);
+
+      Value *Inner = &V;
+
+      // If the value isn't a pointer type, store it in a pointer
+      // Unpack it inside the outlined region
+      if (!V.getType()->isPointerTy()) {
+        IRBuilder<>::InsertPointGuard Guard(Builder);
+        LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
+
+        Builder.restoreIP(OuterAllocaIP);
+        Value *Ptr = Builder.CreateAlloca(
+          V.getType(), nullptr, V.getName() + ".reloaded");
+
+        // Store to stack at end of the block that currently branches to the entry
+        // block of the to-be-outlined region.
+        Builder.SetInsertPoint(
+          InsertBB, InsertBB->getTerminator()->getIterator());
+        Builder.CreateStore(&V, Ptr);
+
+        // Load back next to allocations in the to-be-outlined region.
+        Builder.restoreIP(InnerAllocaIP);
+        Inner = Builder.CreateLoad(V.getType(), Ptr);
+      }
+
+      Value *ReplacementValue = nullptr;
+      Builder.restoreIP(
+        PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
+      assert(ReplacementValue &&
+        "Expected copy/create callback to set replacement value!");
+      if (ReplacementValue == &V)
+        return;
+
+      for (Use *UPtr : Uses)
+        UPtr->set(ReplacementValue);
+
+    };
+
+    LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
+
+    InnerAllocaIP = IRBuilder<>::InsertPoint(
+        OMPIV->getParent(), OMPIV->getNextNode()->getIterator());
+
+    // Reset the outer alloca insertion point to the entry of the relevant block
+    // in case it was invalidated.
+    OuterAllocaIP = IRBuilder<>::InsertPoint(
+      OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
+
+    for (Value *Input : Inputs) {
+      PrivHelper(*Input);
+    }
+
+    assert(Outputs.empty() &&
+      "OpenMP outlining should not produce live-out values!");
+
+    LLVM_DEBUG(dbgs() << "After  privatization: " << *OuterFn << "\n");
+    for (auto *BB : Blocks) {
+      LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n");
+    }
+
+    int NumInputs = Inputs.size();
+
+    OI.PostOutlineCB = [=](Function &OutlinedFn) {
+
+      OutlinedFn.addFnAttr(Attribute::NoUnwind);
+      OutlinedFn.addFnAttr(Attribute::NoRecurse);
+
+      assert(OutlinedFn.arg_size() == 1 &&
+             "Expected structArg as the only argument");
+
+      CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
+      BasicBlock *CallBlock = CI->getParent();
+      CallBlock->setName("omp_simd");
+      Builder.SetInsertPoint(CI);
+
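+      // Replace the direct call `outlined(structArg)` with a dispatch
+      // through the device runtime:
+      //   __kmpc_simd(ident, outlined, args, numInputs)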
+      Value *StructArg = CI->getArgOperand(0);
+
+      Value *SimdArgs[] = {
+          Ident,
+          Builder.CreateBitCast(&OutlinedFn, SimdTaskPtr),
+          Builder.CreateCast(Instruction::BitCast, StructArg, Int8PtrPtr),
+          Builder.getInt32(NumInputs)};
+
+      SmallVector<Value *, 16> RealArgs;
+      RealArgs.append(std::begin(SimdArgs), std::end(SimdArgs));
+
+      FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr(
+        OMPRTL___kmpc_simd);
+      Builder.CreateCall(RTLFn, RealArgs);
+
+      LLVM_DEBUG(dbgs() << "With __kmpc_simd call placed: " << *Builder.GetInsertBlock()->getParent() << "\n");
+
+      CI->eraseFromParent();
+
+      for (Instruction *I : ToBeDeleted)
+        I->eraseFromParent();
+
+    };
+
+    addOutlineInfo(std::move(OI));
+  }
+
+  InsertPointTy AfterIP(FinalizeBB, FinalizeBB->end());
+  UI->eraseFromParent();
+
+  return AfterIP;
+}
+
 IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
     const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
     BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
@@ -1634,7 +2256,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
   LLVM_DEBUG(dbgs() << "After  privatization: " << *OuterFn << "\n");
   LLVM_DEBUG({
     for (auto *BB : Blocks)
       dbgs() << " PBR: " << BB->getName() << "\n";
   });
 
   // Register the outlined info.
@@ -2096,18 +2718,1463 @@ OpenMPIRBuilder::createSection(const LocationDescription &Loc,
                               /*IsCancellable*/ true);
 }
 
-/// Create a function with a unique name and a "void (i8*, i8*)" signature in
-/// the given module and return it.
-Function *getFreshReductionFunc(Module &M) {
+static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
+  BasicBlock::iterator IT(I);
+  IT++;
+  return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
+}
+
+void OpenMPIRBuilder::emitUsed(StringRef Name,
+                               std::vector<WeakTrackingVH> &List) {
+  if (List.empty())
+    return;
+
+  // Convert List to what ConstantArray needs.
+  SmallVector<Constant *, 8> UsedArray;
+  UsedArray.resize(List.size());
+  for (unsigned I = 0, E = List.size(); I != E; ++I)
+    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+        cast<Constant>(&*List[I]), Builder.getPtrTy());
+
+  if (UsedArray.empty())
+    return;
+  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
+
+  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+                                ConstantArray::get(ATy, UsedArray), Name);
+
+  GV->setSection("llvm.metadata");
+}
+
+Value *OpenMPIRBuilder::getGPUThreadID() {
+  return Builder.CreateCall(
+      getOrCreateRuntimeFunction(M,
+                                 OMPRTL___kmpc_get_hardware_thread_id_in_block),
+      {});
+}
+
+Value *OpenMPIRBuilder::getGPUWarpSize() {
+  return Builder.CreateCall(
+      getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
+}
+
+Value *OpenMPIRBuilder::getNVPTXWarpID() {
+  unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
+  return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
+}
+
+Value *OpenMPIRBuilder::getNVPTXLaneID() {
+  unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
+  assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
+  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
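+  // E.g., with a warp size of 32: LaneIDBits = 5, LaneIDMask = 0x1f.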
+  return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
+                           "nvptx_lane_id");
+}
+
+Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
+                                        Type *ToType) {
+  Type *FromType = From->getType();
+  uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
+  uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
+  assert(FromSize > 0 && "From size must be greater than zero");
+  assert(ToSize > 0 && "To size must be greater than zero");
+  if (FromType == ToType)
+    return From;
+  if (FromSize == ToSize)
+    return Builder.CreateBitCast(From, ToType);
+  if (ToType->isIntegerTy() && FromType->isIntegerTy())
+    return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
+  InsertPointTy SaveIP = Builder.saveIP();
+  Builder.restoreIP(AllocaIP);
+  Value *CastItem = Builder.CreateAlloca(ToType);
+  Builder.restoreIP(SaveIP);
+
+  Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      CastItem, FromType->getPointerTo());
+  Builder.CreateStore(From, ValCastItem);
+  return Builder.CreateLoad(ToType, CastItem);
+}
+
+Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
+                                                     Value *Element,
+                                                     Type *ElementType,
+                                                     Value *Offset) {
+  uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
+  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
+
+  // Cast all types to 32- or 64-bit values before calling shuffle routines.
+  Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
+  Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
+  Value *WarpSize =
+      Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
+  Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
+      Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
+                : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
+  Value *WarpSizeCast =
+      Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
+  Value *ShuffleCall =
+      Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
+  return castValueToType(AllocaIP, ShuffleCall, CastTy);
+}
+
+void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
+                                      Value *DstAddr, Type *ElemType,
+                                      Value *Offset, Type *ReductionArrayTy) {
+  uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
+  // Create the loop over the big sized data.
+  // ptr = (void*)Elem;
+  // ptrEnd = (void*) Elem + 1;
+  // Step = 8;
+  // while (ptr + Step < ptrEnd)
+  //   shuffle((int64_t)*ptr);
+  // Step = 4;
+  // while (ptr + Step < ptrEnd)
+  //   shuffle((int32_t)*ptr);
+  // ...
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+  Value *ElemPtr = DstAddr;
+  Value *Ptr = SrcAddr;
+  for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
+    if (Size < IntSize)
+      continue;
+    Type *IntType = Builder.getIntNTy(IntSize * 8);
+    Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast");
+    Value *SrcAddrGEP =
+        Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
+    ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast");
+
+    Function *CurFunc = Builder.GetInsertBlock()->getParent();
+    if ((Size / IntSize) > 1) {
+      Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
+          SrcAddrGEP, Builder.getPtrTy());
+      BasicBlock *PreCondBB =
+          BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
+      BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
+      BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
+      BasicBlock *CurrentBB = Builder.GetInsertBlock();
+      emitBlock(PreCondBB, CurFunc);
+      PHINode *PhiSrc =
+          Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
+      PhiSrc->addIncoming(Ptr, CurrentBB);
+      PHINode *PhiDest =
+          Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
+      PhiDest->addIncoming(ElemPtr, CurrentBB);
+      Ptr = PhiSrc;
+      ElemPtr = PhiDest;
+      Value *PtrDiff = Builder.CreatePtrDiff(
+          Builder.getInt8Ty(), PtrEnd,
+          Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
+      Builder.CreateCondBr(
+          Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
+          ExitBB);
+      emitBlock(ThenBB, CurFunc);
+      Value *Res = createRuntimeShuffleFunction(
+          AllocaIP,
+          Builder.CreateAlignedLoad(
+              IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
+          IntType, Offset);
+      Builder.CreateAlignedStore(Res, ElemPtr,
+                                 M.getDataLayout().getPrefTypeAlign(ElemType));
+      Value *LocalPtr =
+          Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
+      Value *LocalElemPtr =
+          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
+      PhiSrc->addIncoming(LocalPtr, ThenBB);
+      PhiDest->addIncoming(LocalElemPtr, ThenBB);
+      emitBranch(PreCondBB);
+      emitBlock(ExitBB, CurFunc);
+    } else {
+      Value *Res = createRuntimeShuffleFunction(
+          AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
+      if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
+                                         Res->getType()->getScalarSizeInBits())
+        Res = Builder.CreateTrunc(Res, ElemType);
+      Builder.CreateStore(Res, ElemPtr);
+      Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
+      ElemPtr =
+          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
+    }
+    Size = Size % IntSize;
+  }
+}
+
+void OpenMPIRBuilder::emitReductionListCopy(
+    InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
+    ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
+    CopyOptionsTy CopyOptions) {
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+  Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
+
+  // Iterates, element-by-element, through the source Reduce list and
+  // make a copy.
+  for (auto En : enumerate(ReductionInfos)) {
+    const ReductionInfo &RI = En.value();
+    Value *SrcElementAddr = nullptr;
+    Value *DestElementAddr = nullptr;
+    Value *DestElementPtrAddr = nullptr;
+    // Should we shuffle in an element from a remote lane?
+    bool ShuffleInElement = false;
+    // Set to true to update the pointer in the dest Reduce list to a
+    // newly created element.
+    bool UpdateDestListPtr = false;
+
+    // Step 1.1: Get the address for the src element in the Reduce list.
+    Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
+        ReductionArrayTy, SrcBase,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
+
+    // Step 1.2: Create a temporary to store the element in the destination
+    // Reduce list.
+    DestElementPtrAddr = Builder.CreateInBoundsGEP(
+        ReductionArrayTy, DestBase,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    switch (Action) {
+    case CopyAction::RemoteLaneToThread: {
+      InsertPointTy CurIP = Builder.saveIP();
+      Builder.restoreIP(AllocaIP);
+      AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
+                                                    ".omp.reduction.element");
+      DestAlloca->setAlignment(
+          M.getDataLayout().getPrefTypeAlign(RI.ElementType));
+      DestElementAddr = DestAlloca;
+      DestElementAddr =
+          Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
+                                      DestElementAddr->getName() + ".ascast");
+      Builder.restoreIP(CurIP);
+      ShuffleInElement = true;
+      UpdateDestListPtr = true;
+      break;
+    }
+    case CopyAction::ThreadCopy: {
+      DestElementAddr =
+          Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
+      break;
+    }
+    }
+
+    // Now that all active lanes have read the element in the
+    // Reduce list, shuffle over the value from the remote lane.
+    if (ShuffleInElement) {
+      shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
+                      RemoteLaneOffset, ReductionArrayTy);
+    } else {
+      switch (RI.EvaluationKind) {
+      case EvaluationKindTy::Scalar: {
+        Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
+        // Store the source element value to the dest element address.
+        Builder.CreateStore(Elem, DestElementAddr);
+        break;
+      }
+      case EvaluationKindTy::Complex: {
+        Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
+            RI.ElementType, SrcElementAddr, 0, 0, ".realp");
+        Value *SrcReal = Builder.CreateLoad(
+            RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
+        Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
+            RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
+        Value *SrcImg = Builder.CreateLoad(
+            RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
+
+        Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
+            RI.ElementType, DestElementAddr, 0, 0, ".realp");
+        Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
+            RI.ElementType, DestElementAddr, 0, 1, ".imagp");
+        Builder.CreateStore(SrcReal, DestRealPtr);
+        Builder.CreateStore(SrcImg, DestImgPtr);
+        break;
+      }
+      case EvaluationKindTy::Aggregate: {
+        Value *SizeVal = Builder.getInt64(
+            M.getDataLayout().getTypeStoreSize(RI.ElementType));
+        Builder.CreateMemCpy(
+            DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+            SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+            SizeVal, false);
+        break;
+      }
+      }
+    }
+
+    // Step 3.1: Modify reference in dest Reduce list as needed.
+    // Modifying the reference in Reduce list to point to the newly
+    // created element.  The element is live in the current function
+    // scope and that of functions it invokes (i.e., reduce_function).
+    // RemoteReduceData[i] = (void*)&RemoteElem
+    if (UpdateDestListPtr) {
+      Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+          DestElementAddr, Builder.getPtrTy(),
+          DestElementAddr->getName() + ".ascast");
+      Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
+    }
+  }
+}
+
+Function *OpenMPIRBuilder::emitInterWarpCopyFunction(
+    const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
+    AttributeList FuncAttrs) {
+  InsertPointTy SavedIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  FunctionType *FuncTy = FunctionType::get(
+      Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
+      /* IsVarArg */ false);
+  Function *WcFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_inter_warp_copy_func", &M);
+  WcFunc->setAttributes(FuncAttrs);
+  WcFunc->addParamAttr(0, Attribute::NoUndef);
+  WcFunc->addParamAttr(1, Attribute::NoUndef);
+  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
+  Builder.SetInsertPoint(EntryBB);
+
+  // ReduceList: thread local Reduce list.
+  // At the stage of the computation when this function is called, partially
+  // aggregated values reside in the first lane of every active warp.
+  Argument *ReduceListArg = WcFunc->getArg(0);
+  // NumWarps: number of warps active in the parallel region.  This could
+  // be smaller than 32 (max warps in a CTA) for partial block reduction.
+  Argument *NumWarpsArg = WcFunc->getArg(1);
+
+  // This array is used as a medium to transfer, one reduce element at a time,
+  // the data from the first lane of every warp to lanes in the first warp
+  // in order to perform the final step of a reduction in a parallel region
+  // (reduction across warps).  The array is placed in NVPTX __shared__ memory
+  // for reduced latency, as well as to have a distinct copy for concurrently
+  // executing target regions.  The array is declared with weak linkage so
+  // as to be shared across compilation units.
+  StringRef TransferMediumName =
+      "__openmp_nvptx_data_transfer_temporary_storage";
+  GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
+  unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
+  ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
+  if (!TransferMedium) {
+    TransferMedium = new GlobalVariable(
+        M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
+        UndefValue::get(ArrayTy), TransferMediumName,
+        /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
+        /*AddressSpace=*/3);
+  }
+
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+
+  // Get the CUDA thread id of the current OpenMP thread on the GPU.
+  Value *GPUThreadID = getGPUThreadID();
+  // nvptx_lane_id = nvptx_id % warpsize
+  Value *LaneID = getNVPTXLaneID();
+  // nvptx_warp_id = nvptx_id / warpsize
+  Value *WarpID = getNVPTXWarpID();
+
+  InsertPointTy AllocaIP =
+      InsertPointTy(Builder.GetInsertBlock(),
+                    Builder.GetInsertBlock()->getFirstInsertionPt());
+  Type *Arg0Type = ReduceListArg->getType();
+  Type *Arg1Type = NumWarpsArg->getType();
+  Builder.restoreIP(AllocaIP);
+  AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
+      Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
+  AllocaInst *NumWarpsAlloca =
+      Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
+  Value *ThreadID =
+      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
+  Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
+  Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      NumWarpsAlloca, Arg1Type->getPointerTo(),
+      NumWarpsAlloca->getName() + ".ascast");
+  Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
+  Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
+  AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
+  InsertPointTy CodeGenIP =
+      getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
+  Builder.restoreIP(CodeGenIP);
+
+  Value *ReduceList =
+      Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
+
+  for (auto En : enumerate(ReductionInfos)) {
+    //
+    // Warp master copies reduce element to transfer medium in __shared__
+    // memory.
+    //
+    const ReductionInfo &RI = En.value();
+    unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
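+    // Move the element through the 32-bit transfer medium in power-of-two
+    // chunks (4, then 2, then 1 bytes); when a chunk size needs more than
+    // one iteration, an explicit counter loop is emitted.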
+    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
+      Type *CType = Builder.getIntNTy(TySize * 8);
+
+      unsigned NumIters = RealTySize / TySize;
+      if (NumIters == 0)
+        continue;
+      Value *Cnt = nullptr;
+      Value *CntAddr = nullptr;
+      BasicBlock *PrecondBB = nullptr;
+      BasicBlock *ExitBB = nullptr;
+      if (NumIters > 1) {
+        CodeGenIP = Builder.saveIP();
+        Builder.restoreIP(AllocaIP);
+        CntAddr =
+            Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
+
+        CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
+                                              CntAddr->getName() + ".ascast");
+        Builder.restoreIP(CodeGenIP);
+        Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
+                            CntAddr,
+                            /*Volatile=*/false);
+        PrecondBB = BasicBlock::Create(Ctx, "precond");
+        ExitBB = BasicBlock::Create(Ctx, "exit");
+        BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
+        emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
+        Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
+                                 /*Volatile=*/false);
+        Value *Cmp = Builder.CreateICmpULT(
+            Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
+        Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
+        emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
+      }
+
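+      // Two-phase exchange: after a barrier each warp master publishes its
+      // chunk to the shared transfer medium, and after a second barrier the
+      // first NumWarps threads read the chunks back into their Reduce list.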
+      // kmpc_barrier.
+      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+                    omp::Directive::OMPD_unknown,
+                    /* ForceSimpleCall */ false,
+                    /* CheckCancelFlag */ true, ThreadID);
+      BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
+      BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
+      BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
+
+      // if (lane_id  == 0)
+      Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
+      Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
+      emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
+
+      // Reduce element = LocalReduceList[i]
+      auto *RedListArrayTy =
+          ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+      Type *IndexTy = Builder.getIndexTy(
+          M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+      Value *ElemPtrPtr =
+          Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
+                                    {ConstantInt::get(IndexTy, 0),
+                                     ConstantInt::get(IndexTy, En.index())});
+      // elemptr = ((CopyType*)(elemptrptr)) + I
+      Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+      if (NumIters > 1)
+        ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
+
+      // Get pointer to location in transfer medium.
+      // MediumPtr = &medium[warp_id]
+      Value *MediumPtr = Builder.CreateInBoundsGEP(
+          ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
+      // elem = *elemptr
+      //*MediumPtr = elem
+      Value *Elem = Builder.CreateLoad(CType, ElemPtr);
+      // Store the source element value to the dest element address.
+      Builder.CreateStore(Elem, MediumPtr,
+                          /*IsVolatile*/ true);
+      Builder.CreateBr(MergeBB);
+
+      // else
+      emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
+      Builder.CreateBr(MergeBB);
+
+      // endif
+      emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
+      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+                    omp::Directive::OMPD_unknown,
+                    /* ForceSimpleCall */ false,
+                    /* CheckCancelFlag */ true, ThreadID);
+
+      // Warp 0 copies reduce element from transfer medium
+      BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
+      BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
+      BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
+
+      Value *NumWarpsVal =
+          Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
+      // Up to 32 threads in warp 0 are active.
+      Value *IsActiveThread =
+          Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
+      Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
+
+      emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
+
+      // SrcMediumPtr = &medium[tid]
+      // SrcMediumVal = *SrcMediumPtr
+      Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
+          ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
+      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
+      Value *TargetElemPtrPtr =
+          Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
+                                    {ConstantInt::get(IndexTy, 0),
+                                     ConstantInt::get(IndexTy, En.index())});
+      Value *TargetElemPtrVal =
+          Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
+      Value *TargetElemPtr = TargetElemPtrVal;
+      if (NumIters > 1)
+        TargetElemPtr =
+            Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
+
+      // *TargetElemPtr = SrcMediumVal;
+      Value *SrcMediumValue =
+          Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
+      Builder.CreateStore(SrcMediumValue, TargetElemPtr);
+      Builder.CreateBr(W0MergeBB);
+
+      emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
+      Builder.CreateBr(W0MergeBB);
+
+      emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
+
+      if (NumIters > 1) {
+        Cnt = Builder.CreateNSWAdd(
+            Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
+        Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
+
+        auto *CurFn = Builder.GetInsertBlock()->getParent();
+        emitBranch(PrecondBB);
+        emitBlock(ExitBB, CurFn);
+      }
+      RealTySize %= TySize;
+    }
+  }
+
+  Builder.CreateRetVoid();
+  Builder.restoreIP(SavedIP);
+
+  return WcFunc;
+}
+
+Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
+    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+    AttributeList FuncAttrs) {
+  LLVMContext &Ctx = M.getContext();
+  FunctionType *FuncTy =
+      FunctionType::get(Builder.getVoidTy(),
+                        {Builder.getPtrTy(), Builder.getInt16Ty(),
+                         Builder.getInt16Ty(), Builder.getInt16Ty()},
+                        /* IsVarArg */ false);
+  Function *SarFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_shuffle_and_reduce_func", &M);
+  SarFunc->setAttributes(FuncAttrs);
+  SarFunc->addParamAttr(0, Attribute::NoUndef);
+  SarFunc->addParamAttr(1, Attribute::NoUndef);
+  SarFunc->addParamAttr(2, Attribute::NoUndef);
+  SarFunc->addParamAttr(3, Attribute::NoUndef);
+  SarFunc->addParamAttr(1, Attribute::SExt);
+  SarFunc->addParamAttr(2, Attribute::SExt);
+  SarFunc->addParamAttr(3, Attribute::SExt);
+  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
+  Builder.SetInsertPoint(EntryBB);
+
+  // Thread local Reduce list used to host the values of data to be reduced.
+  Argument *ReduceListArg = SarFunc->getArg(0);
+  // Current lane id; could be logical.
+  Argument *LaneIDArg = SarFunc->getArg(1);
+  // Offset of the remote source lane relative to the current lane.
+  Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
+  // Algorithm version.  This is expected to be known at compile time.
+  Argument *AlgoVerArg = SarFunc->getArg(3);
+
+  Type *ReduceListArgType = ReduceListArg->getType();
+  Type *LaneIDArgType = LaneIDArg->getType();
+  Type *LaneIDArgPtrType = LaneIDArg->getType()->getPointerTo();
+  Value *ReduceListAlloca = Builder.CreateAlloca(
+      ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
+  Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
+                                             LaneIDArg->getName() + ".addr");
+  Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
+      LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
+  Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
+                                              AlgoVerArg->getName() + ".addr");
+  ArrayType *RedListArrayTy =
+      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+  // Create a local thread-private variable to host the Reduce list
+  // from a remote lane.
+  Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
+      RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
+
+  Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListAlloca, ReduceListArgType,
+      ReduceListAlloca->getName() + ".ascast");
+  Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
+  Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      RemoteLaneOffsetAlloca, LaneIDArgPtrType,
+      RemoteLaneOffsetAlloca->getName() + ".ascast");
+  Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
+  Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      RemoteReductionListAlloca, Builder.getPtrTy(),
+      RemoteReductionListAlloca->getName() + ".ascast");
+
+  Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
+  Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
+  Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
+  Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
+
+  Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
+  Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
+  Value *RemoteLaneOffset =
+      Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
+  Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
+
+  InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
+
+  // This loop iterates through the list of reduce elements and copies,
+  // element by element, from a remote lane in the warp to RemoteReduceList,
+  // hosted on the thread's stack.
+  emitReductionListCopy(
+      AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
+      ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
+
+  // The actions to be performed on the remote Reduce list depend on the
+  // algorithm version.
+  //
+  //  if (AlgoVer==0) || (AlgoVer==1 && LaneId < Offset) || (AlgoVer==2 &&
+  //  LaneId % 2 == 0 && Offset > 0):
+  //    do the reduction value aggregation
+  //
+  //  The thread local variable Reduce list is mutated in place to host the
+  //  reduced data, which is the aggregated value produced from local and
+  //  remote lanes.
+  //
+  //  Note that AlgoVer is expected to be a constant integer known at compile
+  //  time.
+  //  When AlgoVer==0, the first conjunction evaluates to true, making
+  //    the entire predicate true at compile time.
+  //  When AlgoVer==1, only the LaneId < Offset test of the second
+  //    conjunction needs runtime evaluation; the other conjunctions fold to
+  //    false at compile time.
+  //  When AlgoVer==2, only the LaneId and Offset tests of the third
+  //    conjunction need runtime evaluation; the other conjunctions fold to
+  //    false at compile time.
+  Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
+  Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+  Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
+  Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
+  Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
+  Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
+  Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
+  Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
+  Value *RemoteOffsetComp =
+      Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
+  Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
+  Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
+  Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
+
+  BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
+  BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
+  BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
+
+  Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
+  emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
+  // reduce_function(LocalReduceList, RemoteReduceList)
+  Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceList, Builder.getPtrTy());
+  Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      RemoteListAddrCast, Builder.getPtrTy());
+  Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
+      ->addFnAttr(Attribute::NoUnwind);
+  Builder.CreateBr(MergeBB);
+
+  emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
+  Builder.CreateBr(MergeBB);
+
+  emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
+
+  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
+  // Reduce list.
+  Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+  Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
+  Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
+
+  BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
+  BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
+  BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
+  Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
+
+  emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
+  emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
+                        ReductionInfos, RemoteListAddrCast, ReduceList);
+  Builder.CreateBr(CpyMergeBB);
+
+  emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
+  Builder.CreateBr(CpyMergeBB);
+
+  emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
+
+  Builder.CreateRetVoid();
+
+  return SarFunc;
+}
+
+Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
+    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
+    AttributeList FuncAttrs) {
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  FunctionType *FuncTy = FunctionType::get(
+      Builder.getVoidTy(),
+      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+      /* IsVarArg */ false);
+  Function *LtGCFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_list_to_global_copy_func", &M);
+  LtGCFunc->setAttributes(FuncAttrs);
+  LtGCFunc->addParamAttr(0, Attribute::NoUndef);
+  LtGCFunc->addParamAttr(1, Attribute::NoUndef);
+  LtGCFunc->addParamAttr(2, Attribute::NoUndef);
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Buffer: global reduction buffer.
+  Argument *BufferArg = LtGCFunc->getArg(0);
+  // Idx: index of the buffer.
+  Argument *IdxArg = LtGCFunc->getArg(1);
+  // ReduceList: thread local Reduce list.
+  Argument *ReduceListArg = LtGCFunc->getArg(2);
+
+  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+                                                BufferArg->getName() + ".addr");
+  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+                                             IdxArg->getName() + ".addr");
+  Value *ReduceListArgAlloca = Builder.CreateAlloca(
+      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, Builder.getPtrTy(),
+      BufferArgAlloca->getName() + ".ascast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, Builder.getPtrTy(),
+      ReduceListArgAlloca->getName() + ".ascast");
+
+  Builder.CreateStore(BufferArg, BufferArgAddrCast);
+  Builder.CreateStore(IdxArg, IdxArgAddrCast);
+  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+  Value *LocalReduceList =
+      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+  Value *BufferArgVal =
+      Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+  for (auto En : enumerate(ReductionInfos)) {
+    const ReductionInfo &RI = En.value();
+    auto *RedListArrayTy =
+        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+    // Reduce element = LocalReduceList[i]
+    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
+        RedListArrayTy, LocalReduceList,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    // elemptr = ((CopyType*)(elemptrptr)) + I
+    Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+
+    // Global = Buffer.VD[Idx];
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
+    Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+
+    switch (RI.EvaluationKind) {
+    case EvaluationKindTy::Scalar: {
+      Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
+      Builder.CreateStore(TargetElement, GlobVal);
+      break;
+    }
+    case EvaluationKindTy::Complex: {
+      Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
+          RI.ElementType, ElemPtr, 0, 0, ".realp");
+      Value *SrcReal = Builder.CreateLoad(
+          RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
+      Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
+          RI.ElementType, ElemPtr, 0, 1, ".imagp");
+      Value *SrcImg = Builder.CreateLoad(
+          RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
+
+      Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
+          RI.ElementType, GlobVal, 0, 0, ".realp");
+      Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
+          RI.ElementType, GlobVal, 0, 1, ".imagp");
+      Builder.CreateStore(SrcReal, DestRealPtr);
+      Builder.CreateStore(SrcImg, DestImgPtr);
+      break;
+    }
+    case EvaluationKindTy::Aggregate: {
+      Value *SizeVal =
+          Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
+      Builder.CreateMemCpy(
+          GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
+          M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
+      break;
+    }
+    }
+  }
+
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGCFunc;
+}
+
+Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
+    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+    Type *ReductionsBufferTy, AttributeList FuncAttrs) {
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  FunctionType *FuncTy = FunctionType::get(
+      Builder.getVoidTy(),
+      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+      /* IsVarArg */ false);
+  Function *LtGRFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_list_to_global_reduce_func", &M);
+  LtGRFunc->setAttributes(FuncAttrs);
+  LtGRFunc->addParamAttr(0, Attribute::NoUndef);
+  LtGRFunc->addParamAttr(1, Attribute::NoUndef);
+  LtGRFunc->addParamAttr(2, Attribute::NoUndef);
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Buffer: global reduction buffer.
+  Argument *BufferArg = LtGRFunc->getArg(0);
+  // Idx: index of the buffer.
+  Argument *IdxArg = LtGRFunc->getArg(1);
+  // ReduceList: thread local Reduce list.
+  Argument *ReduceListArg = LtGRFunc->getArg(2);
+
+  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+                                                BufferArg->getName() + ".addr");
+  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+                                             IdxArg->getName() + ".addr");
+  Value *ReduceListArgAlloca = Builder.CreateAlloca(
+      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+  auto *RedListArrayTy =
+      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+  // 1. Build a list of reduction variables.
+  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+  Value *LocalReduceList =
+      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
+
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, Builder.getPtrTy(),
+      BufferArgAlloca->getName() + ".ascast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, Builder.getPtrTy(),
+      ReduceListArgAlloca->getName() + ".ascast");
+  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LocalReduceList, Builder.getPtrTy(),
+      LocalReduceList->getName() + ".ascast");
+
+  Builder.CreateStore(BufferArg, BufferArgAddrCast);
+  Builder.CreateStore(IdxArg, IdxArgAddrCast);
+  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
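+  // Point each slot of the freshly built local Reduce list at the matching
+  // field of the global buffer entry so reduce_function can combine the
+  // global values with the thread-local ones directly.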
+  for (auto En : enumerate(ReductionInfos)) {
+    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
+        RedListArrayTy, LocalReduceListAddrCast,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
+    // Global = Buffer.VD[Idx];
+    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
+  }
+
+  // Call reduce_function(GlobalReduceList, ReduceList)
+  Value *ReduceList =
+      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+  Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
+      ->addFnAttr(Attribute::NoUnwind);
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGRFunc;
+}
+
+Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
+    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
+    AttributeList FuncAttrs) {
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  FunctionType *FuncTy = FunctionType::get(
+      Builder.getVoidTy(),
+      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+      /* IsVarArg */ false);
+  Function *LtGCFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_global_to_list_copy_func", &M);
+  LtGCFunc->setAttributes(FuncAttrs);
+  LtGCFunc->addParamAttr(0, Attribute::NoUndef);
+  LtGCFunc->addParamAttr(1, Attribute::NoUndef);
+  LtGCFunc->addParamAttr(2, Attribute::NoUndef);
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Buffer: global reduction buffer.
+  Argument *BufferArg = LtGCFunc->getArg(0);
+  // Idx: index of the buffer.
+  Argument *IdxArg = LtGCFunc->getArg(1);
+  // ReduceList: thread local Reduce list.
+  Argument *ReduceListArg = LtGCFunc->getArg(2);
+
+  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+                                                BufferArg->getName() + ".addr");
+  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+                                             IdxArg->getName() + ".addr");
+  Value *ReduceListArgAlloca = Builder.CreateAlloca(
+      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, Builder.getPtrTy(),
+      BufferArgAlloca->getName() + ".ascast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, Builder.getPtrTy(),
+      ReduceListArgAlloca->getName() + ".ascast");
+  Builder.CreateStore(BufferArg, BufferArgAddrCast);
+  Builder.CreateStore(IdxArg, IdxArgAddrCast);
+  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+  Value *LocalReduceList =
+      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    auto *RedListArrayTy =
+        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+    // Reduce element = LocalReduceList[i]
+    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
+        RedListArrayTy, LocalReduceList,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    // elemptr = ((CopyType*)(elemptrptr)) + I
+    Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+    // Global = Buffer.VD[Idx];
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
+    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+
+    switch (RI.EvaluationKind) {
+    case EvaluationKindTy::Scalar: {
+      Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
+      Builder.CreateStore(TargetElement, ElemPtr);
+      break;
+    }
+    case EvaluationKindTy::Complex: {
+      Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
+          RI.ElementType, GlobValPtr, 0, 0, ".realp");
+      Value *SrcReal = Builder.CreateLoad(
+          RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
+      Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
+          RI.ElementType, GlobValPtr, 0, 1, ".imagp");
+      Value *SrcImg = Builder.CreateLoad(
+          RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
+
+      Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
+          RI.ElementType, ElemPtr, 0, 0, ".realp");
+      Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
+          RI.ElementType, ElemPtr, 0, 1, ".imagp");
+      Builder.CreateStore(SrcReal, DestRealPtr);
+      Builder.CreateStore(SrcImg, DestImgPtr);
+      break;
+    }
+    case EvaluationKindTy::Aggregate: {
+      Value *SizeVal =
+          Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
+      Builder.CreateMemCpy(
+          ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+          GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+          SizeVal, false);
+      break;
+    }
+    }
+  }
+
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGCFunc;
+}
+
+Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
+    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+    Type *ReductionsBufferTy, AttributeList FuncAttrs) {
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  auto *FuncTy = FunctionType::get(
+      Builder.getVoidTy(),
+      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+      /* IsVarArg */ false);
+  Function *LtGRFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_global_to_list_reduce_func", &M);
+  LtGRFunc->setAttributes(FuncAttrs);
+  LtGRFunc->addParamAttr(0, Attribute::NoUndef);
+  LtGRFunc->addParamAttr(1, Attribute::NoUndef);
+  LtGRFunc->addParamAttr(2, Attribute::NoUndef);
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Buffer: global reduction buffer.
+  Argument *BufferArg = LtGRFunc->getArg(0);
+  // Idx: index of the buffer.
+  Argument *IdxArg = LtGRFunc->getArg(1);
+  // ReduceList: thread local Reduce list.
+  Argument *ReduceListArg = LtGRFunc->getArg(2);
+
+  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+                                                BufferArg->getName() + ".addr");
+  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+                                             IdxArg->getName() + ".addr");
+  Value *ReduceListArgAlloca = Builder.CreateAlloca(
+      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+  ArrayType *RedListArrayTy =
+      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+  // 1. Build a list of reduction variables.
+  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+  Value *LocalReduceList =
+      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
+
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, Builder.getPtrTy(),
+      BufferArgAlloca->getName() + ".ascast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, Builder.getPtrTy(),
+      ReduceListArgAlloca->getName() + ".ascast");
+  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LocalReduceList, Builder.getPtrTy(),
+      LocalReduceList->getName() + ".ascast");
+
+  Builder.CreateStore(BufferArg, BufferArgAddrCast);
+  Builder.CreateStore(IdxArg, IdxArgAddrCast);
+  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
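+  // As in the list-to-global variant, build a Reduce list whose slots point
+  // into the global buffer entry, but pass it as the RHS of reduce_function
+  // so the global values are folded into the thread-local list.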
+  for (auto En : enumerate(ReductionInfos)) {
+    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
+        RedListArrayTy, ReductionList,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    // Global = Buffer.VD[Idx];
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
+    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
+  }
+
+  // Call reduce_function(ReduceList, GlobalReduceList)
+  Value *ReduceList =
+      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+  Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
+      ->addFnAttr(Attribute::NoUnwind);
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGRFunc;
+}
+
+std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
+  std::string Suffix =
+      createPlatformSpecificName({"omp", "reduction", "reduction_func"});
+  return (Name + Suffix).str();
+}
+
+Function *OpenMPIRBuilder::createReductionFunction(
+    StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
+    ReductionGenCBTy ReductionGenCBTy, AttributeList FuncAttrs) {
+  auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
+                                   {Builder.getPtrTy(), Builder.getPtrTy()},
+                                   /* IsVarArg */ false);
+  std::string Name = getReductionFuncName(ReducerName);
+  Function *ReductionFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
+  ReductionFunc->setAttributes(FuncAttrs);
+  ReductionFunc->addParamAttr(0, Attribute::NoUndef);
+  ReductionFunc->addParamAttr(1, Attribute::NoUndef);
+  BasicBlock *EntryBB =
+      BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
+  Builder.SetInsertPoint(EntryBB);
+
+  // Allocate stack slots for the argument pointers (with address-space
+  // casts) before pulling the LHS/RHS element pointers out of them.
+  Value *LHSArrayPtr = nullptr;
+  Value *RHSArrayPtr = nullptr;
+  Argument *Arg0 = ReductionFunc->getArg(0);
+  Argument *Arg1 = ReductionFunc->getArg(1);
+  Type *Arg0Type = Arg0->getType();
+  Type *Arg1Type = Arg1->getType();
+
+  Value *LHSAlloca =
+      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+  Value *RHSAlloca =
+      Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+  Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
+  Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
+  Builder.CreateStore(Arg0, LHSAddrCast);
+  Builder.CreateStore(Arg1, RHSAddrCast);
+  LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
+  RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
+
+  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+  SmallVector<Value *> LHSPtrs, RHSPtrs;
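+  // For each reduction, recover typed element pointers from the two opaque
+  // lists. Clang-style callbacks are fixed up after this loop; other
+  // callbacks emit the combine in place.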
+  for (auto En : enumerate(ReductionInfos)) {
+    const ReductionInfo &RI = En.value();
+    Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
+        RedArrayTy, RHSArrayPtr,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
+    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        RHSI8Ptr, RI.PrivateVariable->getType(),
+        RHSI8Ptr->getName() + ".ascast");
+
+    Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
+        RedArrayTy, LHSArrayPtr,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
+
+    if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
+      LHSPtrs.emplace_back(LHSPtr);
+      RHSPtrs.emplace_back(RHSPtr);
+    } else {
+      Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+      Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+      Value *Reduced;
+      RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
+      if (!Builder.GetInsertBlock())
+        return ReductionFunc;
+      Builder.CreateStore(Reduced, LHSPtr);
+    }
+  }
+
+  if (ReductionGenCBTy == ReductionGenCBTy::Clang)
+    for (auto En : enumerate(ReductionInfos)) {
+      unsigned Index = En.index();
+      const ReductionInfo &RI = En.value();
+      Value *LHSFixupPtr, *RHSFixupPtr;
+      Builder.restoreIP(RI.ReductionGenClang(
+          Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
+
+      // Fix up the callback-generated code to use the correct Values for
+      // the LHS and RHS.
+      LHSFixupPtr->replaceUsesWithIf(
+          LHSPtrs[Index], [ReductionFunc](const Use &U) {
+            return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+                   ReductionFunc;
+          });
+      RHSFixupPtr->replaceUsesWithIf(
+          RHSPtrs[Index], [ReductionFunc](const Use &U) {
+            return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+                   ReductionFunc;
+          });
+    }
+
+  Builder.CreateRetVoid();
+  return ReductionFunc;
+}
+
+static void
+checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+                    bool IsGPU) {
+  for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
+    (void)RI;
+    assert(RI.Variable && "expected non-null variable");
+    assert(RI.PrivateVariable && "expected non-null private variable");
+    assert((RI.ReductionGen || RI.ReductionGenClang) &&
+           "expected non-null reduction generator callback");
+    if (!IsGPU) {
+      assert(
+          RI.Variable->getType() == RI.PrivateVariable->getType() &&
+          "expected variables and their private equivalents to have the same "
+          "type");
+    }
+    assert(RI.Variable->getType()->isPointerTy() &&
+           "expected variables to be pointers");
+  }
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
+    const LocationDescription &Loc, InsertPointTy AllocaIP,
+    InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
+    bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
+    bool IsSimdReduction, ReductionGenCBTy ReductionGenCBTy,
+    std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
+    Value *SrcLocInfo) {
+  if (!updateToLocation(Loc))
+    return InsertPointTy();
+  Builder.restoreIP(CodeGenIP);
+  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
+  LLVMContext &Ctx = M.getContext();
+
+  // Source location for the ident struct
+  if (!SrcLocInfo) {
+    uint32_t SrcLocStrSize;
+    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+    SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+  }
+
+  if (ReductionInfos.empty())
+    return Builder.saveIP();
+
+  Function *CurFunc = Builder.GetInsertBlock()->getParent();
+  AttributeList FuncAttrs;
+  AttrBuilder AttrBldr(Ctx);
+  for (auto Attr : CurFunc->getAttributes().getFnAttrs())
+    AttrBldr.addAttribute(Attr);
+  AttrBldr.removeAttribute(Attribute::OptimizeNone);
+  FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
+
+  Function *ReductionFunc = nullptr;
+  if (GLOBAL_ReductionFunc) {
+    ReductionFunc = GLOBAL_ReductionFunc;
+  } else {
+    CodeGenIP = Builder.saveIP();
+    ReductionFunc = createReductionFunction(
+        Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos,
+        ReductionGenCBTy, FuncAttrs);
+    Builder.restoreIP(CodeGenIP);
+  }
+
+  // Set the grid value in the config; it is needed during lowering later on.
+  if (GridValue.has_value())
+    Config.setGridValue(GridValue.value());
+  else
+    Config.setGridValue(getGridValue(T, ReductionFunc));
+
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
+  Value *RTLoc =
+      getOrCreateIdent(SrcLocStr, SrcLocStrSize, omp::IdentFlag(0), 0);
+
+  // Build res = __kmpc_nvptx_simd_reduce_nowait_v2(...) or
+  // res = __kmpc_nvptx_parallel_reduce_nowait_v2(...) or
+  // res = __kmpc_nvptx_teams_reduce_nowait_v2(...), depending on whether
+  // this is a simd, parallel, or teams reduction.
+  Value *Res;
+
+  // 1. Build a list of reduction variables.
+  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+  auto Size = ReductionInfos.size();
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  Type *RedArrayTy = ArrayType::get(PtrTy, Size);
+  CodeGenIP = Builder.saveIP();
+  Builder.restoreIP(AllocaIP);
+  Value *ReductionListAlloca =
+      Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
+  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
+  Builder.restoreIP(CodeGenIP);
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+  for (auto En : enumerate(ReductionInfos)) {
+    const ReductionInfo &RI = En.value();
+    Value *ElemPtr = Builder.CreateInBoundsGEP(
+        RedArrayTy, ReductionList,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    Value *CastElem =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
+    Builder.CreateStore(CastElem, ElemPtr);
+  }
+  CodeGenIP = Builder.saveIP();
+  Function *SarFunc =
+      emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
+  Function *WcFunc = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
+  Builder.restoreIP(CodeGenIP);
+
+  Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
+
+  unsigned MaxDataSize = 0;
+  SmallVector<Type *> ReductionTypeArgs;
+  for (auto En : enumerate(ReductionInfos)) {
+    auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
+    if (Size > MaxDataSize)
+      MaxDataSize = Size;
+    ReductionTypeArgs.emplace_back(En.value().ElementType);
+  }
+  Value *ReductionDataSize =
+      Builder.getInt64(MaxDataSize * ReductionInfos.size());
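+  // Dispatch to the matching device runtime entry point: the new simd-level
+  // reduction, the parallel (intra-team) reduction, or the buffered teams
+  // reduction.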
+  if (IsSimdReduction) {
+    Value *SarFuncCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
+    Value *WcFuncCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
+    Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
+    Function *SimdReduceFn = getOrCreateRuntimeFunctionPtr(
+        RuntimeFunction::OMPRTL___kmpc_nvptx_simd_reduce_nowait_v2);
+    Res = Builder.CreateCall(SimdReduceFn, Args);
+  } else if (!IsTeamsReduction) {
+    Value *SarFuncCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
+    Value *WcFuncCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
+    Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
+    Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
+        RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
+    Res = Builder.CreateCall(Pv2Ptr, Args);
+  } else {
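+    // Teams reduction stages values through a fixed global buffer, so four
+    // helpers are emitted to copy and reduce between the local Reduce list
+    // and that buffer.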
+    CodeGenIP = Builder.saveIP();
+    StructType *ReductionsBufferTy = StructType::create(
+        Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
+    Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
+        RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
+    Function *LtGCFunc = emitListToGlobalCopyFunction(
+        ReductionInfos, ReductionsBufferTy, FuncAttrs);
+    Function *LtGRFunc = emitListToGlobalReduceFunction(
+        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
+    Function *GtLCFunc = emitGlobalToListCopyFunction(
+        ReductionInfos, ReductionsBufferTy, FuncAttrs);
+    Function *GtLRFunc = emitGlobalToListReduceFunction(
+        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
+    Builder.restoreIP(CodeGenIP);
+
+    Value *KernelTeamsReductionPtr = Builder.CreateCall(
+        RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
+
+    Value *Args3[] = {RTLoc,
+                      KernelTeamsReductionPtr,
+                      Builder.getInt32(ReductionBufNum),
+                      ReductionDataSize,
+                      RL,
+                      SarFunc,
+                      WcFunc,
+                      LtGCFunc,
+                      LtGRFunc,
+                      GtLCFunc,
+                      GtLRFunc};
+
+    Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
+        RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
+    Res = Builder.CreateCall(TeamsReduceFn, Args3);
+  }
+
+  // 5. Build if (res == 1)
+  BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
+  BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
+  Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
+  Builder.CreateCondBr(Cond, ThenBB, ExitBB);
+
+  // 6. Build then branch: where we have reduced values in the master
+  //    thread in each team.
+  //    __kmpc_end_reduce{_nowait}(<gtid>);
+  //    break;
+  emitBlock(ThenBB, CurFunc);
+
+  // Emit the final reduction, folding the thread-private copies into the
+  // original reduction variables.
+  for (auto En : enumerate(ReductionInfos)) {
+    const ReductionInfo &RI = En.value();
+    Value *LHS = RI.Variable;
+    Value *RHS =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
+
+    if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
+      Value *LHSPtr, *RHSPtr;
+      Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
+                                             &LHSPtr, &RHSPtr, CurFunc));
+
+      // Fix up the callback-generated code to use the correct Values for
+      // the LHS and RHS.
+      LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
+        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+               ReductionFunc;
+      });
+      RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
+        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+               ReductionFunc;
+      });
+    } else {
+      // The non-Clang (ReductionGen) callback path is not emitted here yet.
+    }
+  }
+  emitBlock(ExitBB, CurFunc);
+
+  Config.setEmitLLVMUsed();
+
+  return Builder.saveIP();
+}
+
+static Function *getFreshReductionFunc(Module &M) {
   Type *VoidTy = Type::getVoidTy(M.getContext());
   Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
   auto *FuncTy =
       FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
   return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
-                          M.getDataLayout().getDefaultGlobalsAddressSpace(),
                           ".omp.reduction.func", &M);
 }
 
+static void populateReductionFunction(
+    Function *ReductionFunc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+    IRBuilder<> &Builder) {
+  Module *Module = ReductionFunc->getParent();
+  BasicBlock *ReductionFuncBlock =
+      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
+  Builder.SetInsertPoint(ReductionFuncBlock);
+  Value *LHSArrayPtr = nullptr;
+  Value *RHSArrayPtr = nullptr;
+  LHSArrayPtr = ReductionFunc->getArg(0);
+  RHSArrayPtr = ReductionFunc->getArg(1);
+
+  unsigned NumReductions = ReductionInfos.size();
+  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
+
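+  // Combine matching elements in place: LHS[i] = reduce(LHS[i], RHS[i]).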
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedArrayTy, LHSArrayPtr, 0, En.index());
+    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        LHSI8Ptr, RI.Variable->getType());
+    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedArrayTy, RHSArrayPtr, 0, En.index());
+    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
+    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        RHSI8Ptr, RI.PrivateVariable->getType());
+    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+    Value *Reduced;
+    Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
+    if (!Builder.GetInsertBlock())
+      return;
+    Builder.CreateStore(Reduced, LHSPtr);
+  }
+  Builder.CreateRetVoid();
+}
+
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
     ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef) {
@@ -2126,6 +4193,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   if (!updateToLocation(Loc))
     return InsertPointTy();
 
+  if (ReductionInfos.empty())
+    return Builder.saveIP();
+
   BasicBlock *InsertBlock = Loc.IP.getBlock();
   BasicBlock *ContinuationBlock =
       InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
@@ -2154,10 +4224,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   Module *Module = Func->getParent();
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
-  bool CanGenerateAtomic =
-      llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
-        return RI.AtomicReductionGen;
-      });
+  bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
+    return RI.AtomicReductionGen;
+  });
   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                   CanGenerateAtomic
                                       ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
@@ -2167,7 +4236,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   const DataLayout &DL = Module->getDataLayout();
   unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
   Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
-  Function *ReductionFunc = getFreshReductionFunc(*Module);
+  Function *ReductionFunc = getFreshReductionFunc(M);
   Value *Lock = getOMPCriticalRegionLock(".reduction");
   Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
       IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
@@ -4615,7 +6684,8 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
                              Ident,
                              DynamicEnvironment,
                          });
-  Twine KernelEnvironmentName = KernelName + "_kernel_environment";
+  std::string KernelEnvironmentName =
+      (KernelName + "_kernel_environment").str();
   GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
       M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
       KernelEnvironmentInitializer, KernelEnvironmentName,
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 6988292ac71561..ec82b0a553a18b 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1194,7 +1194,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
   }
 
   StructType *StructArgTy = nullptr;
-  AllocaInst *Struct = nullptr;
+  // May also point at the addrspacecast of the alloca (see below).
+  Instruction *Struct = nullptr;
   unsigned NumAggregatedInputs = 0;
   if (AggregateArgs && !StructValues.empty()) {
     std::vector<Type *> ArgTypes;
@@ -1210,12 +1211,13 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
 
     if (ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) {
       auto *StructSpaceCast = new AddrSpaceCastInst(
           Struct, PointerType ::get(Context, 0), "structArg.ascast");
       StructSpaceCast->insertAfter(Struct);
-      params.push_back(StructSpaceCast);
-    } else {
-      params.push_back(Struct);
+      // Route the argument stores below and the call itself through the
+      // casted pointer rather than the raw alloca.
+      Struct = StructSpaceCast;
     }
+    params.push_back(Struct);
     // Store aggregated inputs in the struct.
     for (unsigned i = 0, e = StructValues.size(); i != e; ++i) {
       if (inputs.contains(StructValues[i])) {
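
For clarity, the net effect of this change as a consolidated sketch (illustrative only; chooseStructPointer is a stand-in for the inline logic, with Struct, Context, and DL taken from the surrounding code):

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Instructions.h"

  static llvm::Instruction *
  chooseStructPointer(llvm::Instruction *Struct, llvm::LLVMContext &Context,
                      const llvm::DataLayout &DL, bool ArgsInZeroAddressSpace) {
    if (ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) {
      auto *Cast = new llvm::AddrSpaceCastInst(
          Struct, llvm::PointerType::get(Context, 0), "structArg.ascast");
      Cast->insertAfter(Struct);
      return Cast; // later stores and the call now agree on one pointer
    }
    return Struct;
  }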
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index f4854ed3d16787..fb6d54afdc505f 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -167,6 +167,9 @@ double omp_get_wtick(void);
 
 double omp_get_wtime(void);
 ///}
+
+int omp_get_simd_lane(void);
+
 }
 
 extern "C" {
@@ -233,6 +236,12 @@ void __kmpc_target_deinit();
 ///{
 void *__kmpc_reduction_get_fixed_buffer();
 
+int32_t __kmpc_nvptx_simd_reduce_nowait_v2(IdentTy *Loc,
+                                           uint64_t reduce_data_size,
+                                           void *reduce_data,
+                                           ShuffleReductFnTy shflFct,
+                                           InterWarpCopyFnTy cpyFct);
+
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                                uint64_t reduce_data_size,
                                                void *reduce_data,
@@ -257,6 +266,8 @@ int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId);
 
 void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId);
 
+void __kmpc_simd_barrier(void);
+
 void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId);
 
 void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId);
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index 165904644dbb98..948bfbbf0642af 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -105,6 +105,19 @@ uint32_t getMaxTeamThreads(bool IsSPMD);
 /// Return the number of processing elements on the device.
 uint32_t getNumberOfProcessorElements();
 
+/// Return the simd length (simdlen) of the enclosing simd region.
+uint32_t getSimdLen();
+/// Return the simd group (the OpenMP thread) this hardware thread belongs to.
+uint32_t getSimdGroup();
+/// Return this hardware thread's lane within its simd group.
+uint32_t getSimdLane();
+/// Return true if this hardware thread is lane 0 of its simd group.
+bool isSimdLeader();
+/// Return the number of simd groups in the parallel team.
+uint32_t getNumSimdGroups();
+/// Return the lane mask covering the lanes of this thread's simd group.
+LaneMaskTy simdmask();
+
 } // namespace mapping
 
 } // namespace ompx
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index b2028a8fb4f506..486a92d8f52170 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -322,6 +322,35 @@ uint32_t mapping::getNumberOfProcessorElements() {
   return static_cast<uint32_t>(config::getHardwareParallelism());
 }
 
+// TODO: There is no way to request a 'simdlen' yet, so default to a single
+// lane per simd group; 'teams', 'parallel', and 'simd' then behave as before.
+uint32_t mapping::getSimdLen() { return 1; }
+
+uint32_t mapping::getSimdGroup() {
+  return mapping::getThreadIdInBlock() / mapping::getSimdLen();
+}
+
+uint32_t mapping::getSimdLane() {
+  return mapping::getThreadIdInWarp() % mapping::getSimdLen();
+}
+
+bool mapping::isSimdLeader() { return !mapping::getSimdLane(); }
+
+uint32_t mapping::getNumSimdGroups() {
+  return state::getEffectivePTeamSize() / mapping::getSimdLen();
+}
+
+LaneMaskTy mapping::simdmask() {
+  uint32_t GroupSize = mapping::getSimdLen();
+  uint32_t Group = mapping::getSimdGroup();
+  uint32_t WarpSize = mapping::getWarpSize();
+  // Set GroupSize low bits, then slide them into this group's lane range.
+  LaneMaskTy Mask = ~(LaneMaskTy)0;
+  Mask = Mask >> (sizeof(LaneMaskTy) * 8 - GroupSize);
+  Mask = Mask << ((Group * GroupSize) % WarpSize);
+  return Mask;
+}
+
 ///}
 
 /// Execution mode
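
To make the mask arithmetic concrete, here is a host-side sketch of mapping::simdmask() with the inputs made explicit (illustrative only; LaneMaskTy assumed 64-bit, and simdlen = 4 is hypothetical since the patch defaults to 1):

  #include <cstdint>
  #include <cstdio>

  using LaneMaskTy = uint64_t;

  LaneMaskTy simdMask(uint32_t SimdLen, uint32_t Group, uint32_t WarpSize) {
    LaneMaskTy Mask = ~(LaneMaskTy)0;
    Mask = Mask >> (sizeof(LaneMaskTy) * 8 - SimdLen); // SimdLen low bits
    Mask = Mask << ((Group * SimdLen) % WarpSize);     // slide to the group
    return Mask;
  }

  int main() {
    // With simdlen = 4 on a 32-lane warp, group 3 covers lanes 12..15.
    printf("%#llx\n", (unsigned long long)simdMask(4, 3, 32)); // 0xf000
  }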
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index 031a5ced255180..dc0490830884b2 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -87,8 +87,10 @@ extern "C" {
                                                    int32_t num_threads,
                                                    void *fn, void **args,
                                                    const int64_t nargs) {
+  //printf("SPMD mode\n");
   uint32_t TId = mapping::getThreadIdInBlock();
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
+  NumThreads = NumThreads / mapping::getSimdLen();
   uint32_t PTeamSize =
       NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
   // Avoid the race between the read of the `icv::Level` above and the write
@@ -101,6 +103,8 @@ extern "C" {
     state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                           1u, TId == 0, ident,
                                           /*ForceTeamState=*/true);
+    // TODO: Install a SimdLength team state here once simdlen > 1 is
+    // supported.
     state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                      /*ForceTeamState=*/true);
     state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
@@ -119,7 +124,7 @@ extern "C" {
     // assumptions above.
     synchronize::threadsAligned(atomic::relaxed);
 
-    if (!PTeamSize || TId < PTeamSize)
+    if (!PTeamSize || TId < PTeamSize * mapping::getSimdLen())
       invokeMicrotask(TId, 0, fn, args, nargs);
 
     // Synchronize all threads at the end of a parallel region.
@@ -166,9 +173,11 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   // From this point forward we know that there is no thread state used.
   ASSERT(state::HasThreadState == false, nullptr);
 
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
+  // Reserve getSimdLen() hardware lanes for each OpenMP thread.
+  NumThreads = NumThreads / mapping::getSimdLen();
   uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
   uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
   if (mapping::isSPMDMode()) {
     // This was moved to its own routine so it could be called directly
     // in certain situations to avoid resource consumption of unused
@@ -184,7 +196,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   // set, but they do not have individual ThreadStates yet. If they ever
   // modify the ICVs beyond this point a ThreadStates will be allocated.
 
-  bool IsActiveParallelRegion = NumThreads > 1;
+  bool IsActiveParallelRegion = NumThreads * mapping::getSimdLen() > 1;
   if (!IsActiveParallelRegion) {
     state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
     invokeMicrotask(TId, 0, fn, args, nargs);
@@ -253,12 +265,14 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   }
 
   {
     // Note that the order here is important. `icv::Level` has to be updated
     // last or the other updates will cause a thread specific state to be
     // created.
     state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                           1u, true, ident,
                                           /*ForceTeamState=*/true);
+    // TODO: Install a SimdLength team state here once simdlen > 1 is
+    // supported.
     state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                           (void *)nullptr, true, ident,
                                           /*ForceTeamState=*/true);
@@ -287,7 +303,8 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
 
   // Set to true for workers participating in the parallel region.
   uint32_t TId = mapping::getThreadIdInBlock();
-  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
+  bool ThreadIsActive =
+      TId < state::getEffectivePTeamSize() * mapping::getSimdLen();
   return ThreadIsActive;
 }
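
The thread bookkeeping above can be checked off-device. A host-side sketch with a hypothetical simdlen of 4 and num_threads of 24 (illustrative; the current default simdlen of 1 reproduces the old behavior):

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint32_t SimdLen = 4;     // hypothetical simdlen
    const uint32_t Requested = 24;  // num_threads on the parallel region
    uint32_t NumThreads = Requested / SimdLen; // 6 OpenMP threads
    uint32_t PTeamSize = NumThreads;           // as in __kmpc_parallel_51
    // A hardware thread TId is active while it falls inside the widened team.
    for (uint32_t TId = 0; TId < 32; TId += 4)
      printf("hw %2u: group %u, lane %u, active %d\n", TId, TId / SimdLen,
             TId % SimdLen, TId < PTeamSize * SimdLen);
  }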
 
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 744d1a3a231c83..2bfec6dcd6faec 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -164,9 +164,47 @@ uint32_t roundToWarpsize(uint32_t s) {
 
 uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
 
+static int32_t nvptx_simd_reduce_nowait(void *reduce_data,
+                                        ShuffleReductFnTy shflFct,
+                                        InterWarpCopyFnTy cpyFct) {
+  uint32_t NumThreads = mapping::getSimdLen();
+  // A single-lane simd group has nothing to combine; the leader already
+  // holds the result.
+  if (NumThreads == 1)
+    return 1;
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  if (NumThreads == mapping::getWarpSize())
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/NumThreads,
+                              /*LaneId=*/mapping::getSimdLane());
+#else
+  __kmpc_impl_lanemask_t Liveness = mapping::simdmask();
+  if (Liveness == lanes::All) // Full warp
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/utils::popc(Liveness),
+                              /*LaneId=*/mapping::getSimdLane());
+#endif
+
+  // Only the simd group leader reports that the reduction is complete.
+  return mapping::isSimdLeader();
+}
+
 } // namespace
 
 extern "C" {
+int32_t __kmpc_nvptx_simd_reduce_nowait_v2(IdentTy *Loc,
+                                           uint64_t reduce_data_size,
+                                           void *reduce_data,
+                                           ShuffleReductFnTy shflFct,
+                                           InterWarpCopyFnTy cpyFct) {
+  return nvptx_simd_reduce_nowait(reduce_data, shflFct, cpyFct);
+}
+
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                                uint64_t reduce_data_size,
                                                void *reduce_data,
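
For intuition, the warp reduction that shflFct drives is the usual shuffle-down tree. A host-side model with the lanes of one simd group simulated by an array (illustrative only; on device the data movement is a warp shuffle and the combiner is the compiler-generated shflFct):

  #include <cstdint>
  #include <cstdio>

  int main() {
    // One partial sum per lane of a simd group, simdlen = 4.
    uint32_t Lanes[4] = {1, 2, 3, 4};
    // Each round halves the active width until lane 0 holds the total.
    for (uint32_t Offset = 4 / 2; Offset > 0; Offset /= 2)
      for (uint32_t L = 0; L < Offset; ++L)
        Lanes[L] += Lanes[L + Offset];
    printf("leader holds %u\n", Lanes[0]); // 10; only lane 0 reports success
  }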
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index a1e4fa2449d9a2..ba8e9a982dae74 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -364,13 +364,18 @@ void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
 }
 
 int omp_get_ancestor_thread_num(int Level) {
-  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
+  // An OpenMP thread id now names a simd group rather than a hardware thread.
+  return returnValIfLevelIsActive(Level, mapping::getSimdGroup(), 0);
 }
 
 int omp_get_thread_num(void) {
   return omp_get_ancestor_thread_num(omp_get_level());
 }
 
+int omp_get_simd_lane(void) {
+  return mapping::getSimdLane();
+}
+
 int omp_get_team_size(int Level) {
   return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
 }
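
The new id mapping is easiest to see with numbers. A host model of the expected query results (illustrative only; SimdLen = 4 is hypothetical, the patch currently defaults to 1):

  #include <cstdio>

  int main() {
    const unsigned SimdLen = 4;
    for (unsigned HwTid = 0; HwTid < 8; ++HwTid)
      printf("hw thread %u -> omp_get_thread_num %u, omp_get_simd_lane %u\n",
             HwTid, HwTid / SimdLen, HwTid % SimdLen);
  }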
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index 80ba87b300bcdd..1d601657d391b5 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -529,6 +529,11 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
   impl::namedBarrier();
 }
 
+// Synchronize only the lanes of the calling thread's simd group.
+void __kmpc_simd_barrier(void) {
+  synchronize::warp(mapping::simdmask());
+}
+
 [[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
   synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
 }
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index bcb7c5ad50a185..658c26c24a3a83 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -468,6 +468,21 @@ static void popDST() {
   ThreadDSTPtr = OldDST;
 }
 
+template <typename IType>
+void SimdLoop(IdentTy *ident, void *WorkFn, IType TripCount, void **Args) {
+  ASSERT(WorkFn, "expected valid outlined function");
+  __kmpc_impl_lanemask_t SimdMask = mapping::simdmask();
+  uint32_t Step = mapping::getSimdLen();
+
+  // Lanes of a simd group execute interleaved iterations: lane L runs
+  // iterations L, L + simdlen, L + 2 * simdlen, ...
+  synchronize::warp(SimdMask);
+  for (IType omp_iv = (IType)mapping::getSimdLane(); omp_iv < TripCount;
+       omp_iv += Step)
+    ((void (*)(IType, void **))WorkFn)(omp_iv, Args);
+  synchronize::warp(SimdMask);
+}
+
 extern "C" {
 
 // init
@@ -634,6 +656,23 @@ void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
 void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
 
 void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
+
+void __kmpc_simd_4u(IdentTy *ident, void *WorkFn, uint32_t TripCount,
+                    void **Args) {
+  SimdLoop<uint32_t>(ident, WorkFn, TripCount, Args);
+}
+
+void __kmpc_simd_8u(IdentTy *ident, void *WorkFn, uint64_t TripCount,
+                    void **Args) {
+  SimdLoop<uint64_t>(ident, WorkFn, TripCount, Args);
+}
+
+// Non-loop variant: directly invoke the outlined simd region once.
+void __kmpc_simd(IdentTy *ident, void *WorkFn, void **Args, uint32_t nargs) {
+  ASSERT(WorkFn, "expected valid outlined function");
+  ((void (*)(void **))WorkFn)(Args);
+}
+
 }
 
 namespace ompx {
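
Putting the Workshare.cpp entry points together with the code generation described above, the intended lowering of a simd loop looks roughly like the following host-side sketch. Here body_fn stands in for the outlined LoopBody (which converts an iteration number into a loop variable), and kmpc_simd_4u_model mirrors one lane's view of __kmpc_simd_4u; names and the host harness are illustrative, not the generated code:

  #include <cstdint>
  #include <cstdio>

  // Stand-in for the outlined LoopBody: runs one iteration given omp_iv.
  static void body_fn(uint32_t omp_iv, void **Args) {
    float *A = (float *)Args[0];
    A[omp_iv] += 1.0f;
  }

  // One lane's view of __kmpc_simd_4u: interleaved iterations, stride simdlen.
  static void kmpc_simd_4u_model(void (*WorkFn)(uint32_t, void **),
                                 uint32_t TripCount, void **Args,
                                 uint32_t Lane, uint32_t SimdLen) {
    for (uint32_t omp_iv = Lane; omp_iv < TripCount; omp_iv += SimdLen)
      WorkFn(omp_iv, Args);
  }

  int main() {
    float A[8] = {0};
    void *Args[] = {A};
    // With the current default simdlen of 1, lane 0 runs every iteration.
    kmpc_simd_4u_model(body_fn, 8, Args, /*Lane=*/0, /*SimdLen=*/1);
    printf("A[7] = %g\n", A[7]); // 1
  }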



More information about the cfe-commits mailing list