[clang] [llvm] OpenMP offload 'simd' directive (PR #91261)

Eric Wright via cfe-commits cfe-commits at lists.llvm.org
Fri Oct 18 08:56:17 PDT 2024


https://github.com/efwright updated https://github.com/llvm/llvm-project/pull/91261

>From 4b76d56f38baf86f6b65ef7e610ad266ba3d69b1 Mon Sep 17 00:00:00 2001
From: Eric Francis Wright <wright117 at rzansel61.coral.llnl.gov>
Date: Mon, 6 May 2024 12:20:44 -0700
Subject: [PATCH 1/5] OpenMP offload 'simd' directive

---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         |   2 +
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |   8 +-
 clang/lib/CodeGen/CGStmtOpenMP.cpp            | 185 +++--
 clang/lib/CodeGen/CodeGenFunction.cpp         |   2 +-
 .../target_teams_generic_loop_codegen.cpp     |  18 +-
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  27 +-
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |  12 +
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 655 +++++++++++++++++-
 llvm/lib/Transforms/Utils/CodeExtractor.cpp   |  13 +-
 offload/DeviceRTL/include/Interface.h         |  11 +
 offload/DeviceRTL/include/Mapping.h           |   7 +
 offload/DeviceRTL/src/Kernel.cpp              |   4 +-
 offload/DeviceRTL/src/Mapping.cpp             |  34 +
 offload/DeviceRTL/src/Parallelism.cpp         |  25 +-
 offload/DeviceRTL/src/Reduction.cpp           |  48 ++
 offload/DeviceRTL/src/State.cpp               |   7 +-
 offload/DeviceRTL/src/Synchronization.cpp     |   4 +
 offload/DeviceRTL/src/Workshare.cpp           |  44 ++
 18 files changed, 1023 insertions(+), 83 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 3747b00d4893ad..836253ab1a7d8b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1035,6 +1035,7 @@ static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
 
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
     : CGM(CGM), OMPBuilder(CGM.getModule()) {
+
   KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8);
   llvm::OpenMPIRBuilderConfig Config(
       CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
@@ -1056,6 +1057,7 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
 }
 
 void CGOpenMPRuntime::clear() {
+
   InternalVars.clear();
   // Clean non-target variable declarations possibly used only in debug info.
   for (const auto &Data : EmittedNonTargetVariables) {
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 35ff75416cb776..16aff085579807 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -262,6 +262,7 @@ class CheckVarsEscapingDeclContext final
                                bool IsCombinedParallelRegion) {
     if (!S)
       return;
+
     for (const CapturedStmt::Capture &C : S->captures()) {
       if (C.capturesVariable() && !C.capturesVariableByCopy()) {
         const ValueDecl *VD = C.getCapturedVar();
@@ -336,13 +337,15 @@ class CheckVarsEscapingDeclContext final
       return;
     if (!D->hasAssociatedStmt())
       return;
+
     if (const auto *S =
             dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
       // Do not analyze directives that do not actually require capturing,
       // like `omp for` or `omp simd` directives.
       llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
       getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
-      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
+      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown &&
+          D->getDirectiveKind() != OMPD_simd) {
         VisitStmt(S->getCapturedStmt());
         return;
       }
@@ -1661,6 +1664,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
   bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
+  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
 
   ASTContext &C = CGM.getContext();
 
@@ -1755,7 +1759,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
 
   CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
       OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
-      DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
+      DistributeReduction, SimdReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
       CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
       RTLoc));
   return;
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 71a27d0c6bc1fb..b4e699c1d003b8 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1454,6 +1454,7 @@ void CodeGenFunction::EmitOMPReductionClauseInit(
     }
 
     const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(TaskRedRef)->getDecl());
+llvm::dbgs() << "Emitting " << VD->getName() << " " << VD << "\n";
     EmitVarDecl(*VD);
     EmitStoreOfScalar(ReductionDesc, GetAddrOfLocalVar(VD),
                       /*Volatile=*/false, TaskRedRef->getType());
@@ -1494,7 +1495,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
     bool WithNowait = D.getSingleClause<OMPNowaitClause>() ||
                       isOpenMPParallelDirective(EKind) ||
                       TeamsLoopCanBeParallel || ReductionKind == OMPD_simd;
-    bool SimpleReduction = ReductionKind == OMPD_simd;
+    bool SimpleReduction = (CGM.getLangOpts().OpenMPIsTargetDevice ? false : ReductionKind == OMPD_simd);
     // Emit nowait reduction if nowait clause is present or directive is a
     // parallel directive (it always has implicit barrier).
     CGM.getOpenMPRuntime().emitReduction(
@@ -2736,59 +2737,139 @@ GetAlignedMapping(const OMPLoopDirective &S, CodeGenFunction &CGF) {
 // available for "loop bind(thread)", which maps to "simd".
 static void emitOMPSimdDirective(const OMPLoopDirective &S,
                                  CodeGenFunction &CGF, CodeGenModule &CGM) {
-  bool UseOMPIRBuilder =
-      CGM.getLangOpts().OpenMPIRBuilder && isSimdSupportedByOpenMPIRBuilder(S);
-  if (UseOMPIRBuilder) {
-    auto &&CodeGenIRBuilder = [&S, &CGM, UseOMPIRBuilder](CodeGenFunction &CGF,
-                                                          PrePostActionTy &) {
-      // Use the OpenMPIRBuilder if enabled.
-      if (UseOMPIRBuilder) {
-        llvm::MapVector<llvm::Value *, llvm::Value *> AlignedVars =
-            GetAlignedMapping(S, CGF);
-        // Emit the associated statement and get its loop representation.
-        const Stmt *Inner = S.getRawStmt();
-        llvm::CanonicalLoopInfo *CLI =
-            CGF.EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
-
-        llvm::OpenMPIRBuilder &OMPBuilder =
-            CGM.getOpenMPRuntime().getOMPBuilder();
-        // Add SIMD specific metadata
-        llvm::ConstantInt *Simdlen = nullptr;
-        if (const auto *C = S.getSingleClause<OMPSimdlenClause>()) {
-          RValue Len = CGF.EmitAnyExpr(C->getSimdlen(), AggValueSlot::ignored(),
-                                       /*ignoreResult=*/true);
-          auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
-          Simdlen = Val;
-        }
-        llvm::ConstantInt *Safelen = nullptr;
-        if (const auto *C = S.getSingleClause<OMPSafelenClause>()) {
-          RValue Len = CGF.EmitAnyExpr(C->getSafelen(), AggValueSlot::ignored(),
-                                       /*ignoreResult=*/true);
-          auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
-          Safelen = Val;
-        }
-        llvm::omp::OrderKind Order = llvm::omp::OrderKind::OMP_ORDER_unknown;
-        if (const auto *C = S.getSingleClause<OMPOrderClause>()) {
-          if (C->getKind() == OpenMPOrderClauseKind::OMPC_ORDER_concurrent) {
-            Order = llvm::omp::OrderKind::OMP_ORDER_concurrent;
-          }
-        }
-        // Add simd metadata to the collapsed loop. Do not generate
-        // another loop for if clause. Support for if clause is done earlier.
-        OMPBuilder.applySimd(CLI, AlignedVars,
-                             /*IfCond*/ nullptr, Order, Simdlen, Safelen);
-        return;
-      }
+  bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIsTargetDevice;
+  if(UseOMPIRBuilder) {
+    auto *CS = dyn_cast<CapturedStmt>(S.getAssociatedStmt());
+    auto *CL = dyn_cast<OMPCanonicalLoop>(CS->getCapturedStmt());
+    CGCapturedStmtInfo CGSI(*CS, CR_OpenMP);
+
+    CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(*this, &CGSI);
+    llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
+      AllocaInsertPt->getParent(), AllocaInsertPt->getIterator());
+
+    llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder();
+
+    using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+
+    // Callback function for generating the trip count of the loop.
+    // This function should assign values to the TripCount and Signed variables
+    llvm::Value *LoopVar;
+    std::string LoopVarName;
+    EmittedClosureTy LoopVarClosure;
+
+    auto DistanceCB = [&](llvm::BasicBlock *AllocaBB,
+                          InsertPointTy CodeGenIP) -> llvm::Value* {
+      InsertPointTy AllocaIP(AllocaBB, AllocaBB->getTerminator()->getIterator());
+      OMPBuilderCBHelpers::OutlinedRegionBodyRAII IRB(
+        *this, AllocaIP, *(CodeGenIP.getBlock()));
+      Builder.restoreIP(CodeGenIP);
+
+      // Emit the loop variable, needed for the distance func
+      const auto *For = dyn_cast<ForStmt>(CL->getLoopStmt());
+      if(const Stmt *InitStmt = For->getInit())
+        EmitStmt(InitStmt);
+
+      auto *LoopVarRef = CL->getLoopVarRef();
+      LValue LCVal = EmitLValue(LoopVarRef);
+      //Address LoopVarAddress = LCVal.getAddress(*this);
+      //LoopVar = dyn_cast<llvm::Instruction>(LoopVarAddress.getPointer());
+      LoopVar = dyn_cast<llvm::Instruction>(LCVal.getPointer(*this));
+      LoopVarName = LoopVarRef->getNameInfo().getAsString();
+
+      // Emit the distance func from the CanonicalLoop
+      const CapturedStmt *DistanceFunc = CL->getDistanceFunc();
+      EmittedClosureTy DistanceClosure = emitCapturedStmtFunc(*this, DistanceFunc);
+
+      // Load the output and store it in the TripCount
+      QualType LogicalTy = DistanceFunc->getCapturedDecl()
+                           ->getParam(0)
+                           ->getType()
+                           .getNonReferenceType();
+
+      //Address CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
+      RawAddress CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
+ 
+      emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()});
+      auto *TripCount = Builder.CreateLoad(CountAddr, ".count");
+
+      const CapturedStmt *LoopVarFunc = CL->getLoopVarFunc();
+      LoopVarClosure = emitCapturedStmtFunc(*this, LoopVarFunc);
+
+      return TripCount;
     };
-    {
-      auto LPCRegion =
-          CGOpenMPRuntime::LastprivateConditionalRAII::disable(CGF, S);
-      OMPLexicalScope Scope(CGF, S, OMPD_unknown);
-      CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_simd,
-                                                  CodeGenIRBuilder);
-    }
+
+    auto FiniCB = [this](InsertPointTy IP) {
+      OMPBuilderCBHelpers::FinalizeOMPRegion(*this, IP);
+    };
+
+    auto PrivCB = [](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                     llvm::Value &, llvm::Value &Val, llvm::Value *&ReplVal) {
+      ReplVal = &Val;
+      return CodeGenIP;
+    };
+
+    auto BodyGenCB = [&]
+                     (//InsertPointTy OuterAllocaIP,
+                      llvm::BasicBlock *OuterAllocaBB,
+                      InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                      InsertPointTy Prolog, InsertPointTy ReductionEpilog,
+                      llvm::Value *Virtual) {
+
+      llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
+      InsertPointTy OuterAllocaIP(OuterAllocaBB, OuterAllocaBB->getTerminator()->getIterator());
+
+      OMPBuilderCBHelpers::OutlinedRegionBodyRAII IRB(
+        *this, OuterAllocaIP, *(Prolog.getBlock()));
+      Builder.restoreIP(Prolog);
+
+      OMPPrivateScope PrivateScope(*this);
+      EmitOMPFirstprivateClause(S, PrivateScope);
+      EmitOMPPrivateClause(S, PrivateScope);
+      EmitOMPReductionClauseInit(S, PrivateScope);
+      PrivateScope.Privatize();
+
+      const CapturedStmt *LoopVarFunc = CL->getLoopVarFunc();
+
+      Builder.restoreIP(CodeGenIP);
+      emitCapturedStmtCall(*this, LoopVarClosure,
+                           {LoopVar, Virtual});
+
+      // Generate the body of the loop
+      OMPBuilderCBHelpers::EmitOMPOutlinedRegionBody(
+          *this,
+          S.getBody(),
+          AllocaIP,
+          CodeGenIP,
+          "simd");
+
+       llvm::BasicBlock *RedEpilogBB = ReductionEpilog.getBlock();
+       llvm::Instruction *RedEpilogTerminator = RedEpilogBB->getTerminator();
+       llvm::BasicBlock *FinalBlock = RedEpilogBB->getSingleSuccessor();
+
+       Builder.restoreIP(ReductionEpilog);
+       EmitOMPReductionClauseFinal(S, OMPD_simd);
+
+       llvm::BasicBlock *ReductionThenBB = Builder.GetInsertBlock();
+
+       if(!(ReductionThenBB->getTerminator())) {
+         RedEpilogTerminator->eraseFromParent();
+         Builder.CreateBr(FinalBlock);
+       }
+
+    };
+
+    Builder.restoreIP(
+      OMPBuilder.createSimdLoop(
+        Builder,
+        AllocaIP,
+        BodyGenCB,
+        DistanceCB,
+        PrivCB,
+        FiniCB
+    ));
+
     return;
-  }
+  } 
 
   CodeGenFunction::ParentLoopDirectiveForScanRegion ScanRegion(CGF, S);
   CGF.OMPFirstScanLoop = true;
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 2306043c90f406..4e3350db14b1d2 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -102,7 +102,7 @@ CodeGenFunction::~CodeGenFunction() {
   // seems to be a reasonable spot. We do it here, as opposed to the deletion
   // time of the CodeGenModule, because we have to ensure the IR has not yet
   // been "emitted" to the outside, thus, modifications are still sensible.
-  if (CGM.getLangOpts().OpenMPIRBuilder && CurFn)
+  if ((CGM.getLangOpts().OpenMPIsTargetDevice || CGM.getLangOpts().OpenMPIRBuilder) && CurFn)
     CGM.getOpenMPRuntime().getOMPBuilder().finalize(CurFn);
 }
 
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
index e05b3209f9eff2..4194bdec549dd4 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
@@ -481,6 +481,7 @@ int foo() {
 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
+// IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
@@ -498,7 +499,6 @@ int foo() {
 // IR-GPU-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
 // IR-GPU-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
 // IR-GPU:       body:
-// IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[TMP2]])
 // IR-GPU-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
 // IR-GPU-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
@@ -513,12 +513,11 @@ int foo() {
 // IR-GPU:       else:
 // IR-GPU-NEXT:    br label [[IFCONT]]
 // IR-GPU:       ifcont:
-// IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
 // IR-GPU-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
 // IR-GPU-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
 // IR-GPU-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
-// IR-GPU:       then3:
+// IR-GPU:       then2:
 // IR-GPU-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
 // IR-GPU-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
 // IR-GPU-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
@@ -526,9 +525,9 @@ int foo() {
 // IR-GPU-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
 // IR-GPU-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
 // IR-GPU-NEXT:    br label [[IFCONT4:%.*]]
-// IR-GPU:       else4:
+// IR-GPU:       else3:
 // IR-GPU-NEXT:    br label [[IFCONT4]]
-// IR-GPU:       ifcont5:
+// IR-GPU:       ifcont4:
 // IR-GPU-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
 // IR-GPU-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
 // IR-GPU-NEXT:    br label [[PRECOND]]
@@ -627,6 +626,7 @@ int foo() {
 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
+// IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
@@ -644,7 +644,6 @@ int foo() {
 // IR-GPU-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
 // IR-GPU-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
 // IR-GPU:       body:
-// IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
 // IR-GPU-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
 // IR-GPU-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
@@ -659,12 +658,11 @@ int foo() {
 // IR-GPU:       else:
 // IR-GPU-NEXT:    br label [[IFCONT]]
 // IR-GPU:       ifcont:
-// IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
 // IR-GPU-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
 // IR-GPU-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
 // IR-GPU-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
-// IR-GPU:       then3:
+// IR-GPU:       then2:
 // IR-GPU-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
 // IR-GPU-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
 // IR-GPU-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
@@ -672,9 +670,9 @@ int foo() {
 // IR-GPU-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
 // IR-GPU-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
 // IR-GPU-NEXT:    br label [[IFCONT4:%.*]]
-// IR-GPU:       else4:
+// IR-GPU:       else3:
 // IR-GPU-NEXT:    br label [[IFCONT4]]
-// IR-GPU:       ifcont5:
+// IR-GPU:       ifcont4:
 // IR-GPU-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
 // IR-GPU-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
 // IR-GPU-NEXT:    br label [[PRECOND]]
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 8834c3b1f50115..82041a7b2a03fb 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -285,7 +285,7 @@ class OffloadEntriesInfoManager {
   /// Return true if a there are no entries defined.
   bool empty() const;
   /// Return number of entries defined so far.
-  unsigned size() const { return OffloadingEntriesNum; }
+  unsigned size() const { return OffloadingEntriesNum /*OffloadEntriesTargetRegion.size()*/ /*OffloadingEntriesNum*/; }
 
   OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}
 
@@ -514,6 +514,11 @@ class OpenMPIRBuilder {
   ///                              all functions are finalized.
   void finalize(Function *Fn = nullptr);
 
+  CallInst *globalizeAlloca(AllocaInst *Alloca, SmallVector<Instruction*, 32>&);
+  void globalizeParallelVars(Function *CurFn);
+  SmallPtrSet<Value*, 32> VarsNeedingGlobalization;
+  void globalizeVars(Function *CurFn);
+
   /// Add attributes known for \p FnID to \p Fn.
   void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
 
@@ -592,6 +597,18 @@ class OpenMPIRBuilder {
   using BodyGenCallbackTy =
       function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
 
+  using LoopBodyCallbackTy =
+      function_ref<void(
+        BasicBlock *OuterAllocaBB, InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+        InsertPointTy PrologIP, InsertPointTy ReductionEpilogIP,
+        Value *IterationNum
+      )>;
+
+  using TripCountCallbackTy =
+      function_ref<
+        Value*(llvm::BasicBlock *AllocaBB, InsertPointTy CodeGenIP)
+      >;
+
   // This is created primarily for sections construct as llvm::function_ref
   // (BodyGenCallbackTy) is not storable (as described in the comments of
   // function_ref class - function_ref contains non-ownable reference
@@ -672,6 +689,13 @@ class OpenMPIRBuilder {
   InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition,
                              omp::Directive CanceledDirective);
 
+  IRBuilder<>::InsertPoint
+  createSimdLoop(const LocationDescription &Loc, InsertPointTy AllocaIP,
+                 LoopBodyCallbackTy BodyGenCB,
+                 TripCountCallbackTy DistanceCB,
+                 PrivatizeCallbackTy PrivCB,
+                 FinalizeCallbackTy FiniCB);
+
   /// Generator for '#omp parallel'
   ///
   /// \param Loc The insert and source location description.
@@ -1876,6 +1900,7 @@ class OpenMPIRBuilder {
       InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
       bool IsNoWait = false, bool IsTeamsReduction = false,
       bool HasDistribute = false,
+      bool IsSimdReduction = false,
       ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
       std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
       Value *SrcLocInfo = nullptr);
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index d8f3c8fa06b747..81dc9299207693 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -124,6 +124,9 @@ __OMP_FUNCTION_TYPE(ShuffleReduce, false, Void, VoidPtr, Int16, Int16, Int16)
 __OMP_FUNCTION_TYPE(InterWarpCopy, false, Void, VoidPtr, Int32)
 __OMP_FUNCTION_TYPE(GlobalList, false, Void, VoidPtr, Int32, VoidPtr)
 
+__OMP_FUNCTION_TYPE(LoopTask, false, Void, Int64, VoidPtrPtr)
+__OMP_FUNCTION_TYPE(SimdTask, false, Void, VoidPtrPtr)
+
 #undef __OMP_FUNCTION_TYPE
 #undef OMP_FUNCTION_TYPE
 
@@ -204,6 +207,7 @@ __ICV_RT_GET(proc_bind, omp_get_proc_bind)
 
 
 __OMP_RTL(__kmpc_barrier, false, Void, IdentPtr, Int32)
+__OMP_RTL(__kmpc_simd_barrier, false, Void, )
 __OMP_RTL(__kmpc_cancel, false, Int32, IdentPtr, Int32, Int32)
 __OMP_RTL(__kmpc_cancel_barrier, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_error, false, Void, IdentPtr, Int32, Int8Ptr)
@@ -227,6 +231,7 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
 __OMP_RTL(__kmpc_get_warp_size, false, Int32, )
 
 __OMP_RTL(omp_get_thread_num, false, Int32, )
+__OMP_RTL(omp_get_simd_lane, false, Int32, )
 __OMP_RTL(omp_get_num_threads, false, Int32, )
 __OMP_RTL(omp_get_max_threads, false, Int32, )
 __OMP_RTL(omp_in_parallel, false, Int32, )
@@ -484,6 +489,8 @@ __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
 __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_shuffle_int32, false, Int32, Int32, Int16, Int16)
+__OMP_RTL(__kmpc_nvptx_simd_reduce_nowait_v2, false, Int32, IdentPtr,
+	  Int64, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr)
 __OMP_RTL(__kmpc_nvptx_parallel_reduce_nowait_v2, false, Int32, IdentPtr,
 	  Int64, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr)
 __OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr,
@@ -509,6 +516,10 @@ __OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
 __OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr)
 __OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64)
 
+__OMP_RTL(__kmpc_simd_4u, false, Void, IdentPtr, LoopTaskPtr, Int32, VoidPtrPtr)
+__OMP_RTL(__kmpc_simd_8u, false, Void, IdentPtr, LoopTaskPtr, Int64, VoidPtrPtr)
+__OMP_RTL(__kmpc_simd, false, Void, IdentPtr, SimdTaskPtr, VoidPtrPtr, Int32)
+
 __OMP_RTL(__last, false, Void, )
 
 #undef __OMP_RTL
@@ -715,6 +726,7 @@ __OMP_RTL_ATTRS(__kmpc_get_hardware_num_threads_in_block, GetterAttrs, ZExt, Par
 __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
 
 __OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_simd_lane, GetterAttrs, SExt, ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
 __OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 172812a3802d33..1d5b24475d1d1b 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -150,6 +150,8 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
 }
 #endif
 
+Function *GLOBAL_ReductionFunc = nullptr;
+
 static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
   if (T.isAMDGPU()) {
     StringRef Features =
@@ -798,6 +800,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
   for (Function *F : ConstantAllocaRaiseCandidates)
     raiseUserConstantDataAllocasToEntryBlock(Builder, F);
 
+  //globalizeVars(Fn);
+
   EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
       [](EmitMetadataErrorKind Kind,
          const TargetRegionEntryInfo &EntryInfo) -> void {
@@ -806,7 +810,7 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
               "OMPIRBuilder finalization \n";
   };
 
-  if (!OffloadInfoManager.empty())
+  if (!OffloadInfoManager.empty()) 
     createOffloadEntriesAndInfoMetadata(ErrorReportFn);
 
   if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
@@ -814,6 +818,159 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
         M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
     emitUsed("llvm.compiler.used", LLVMCompilerUsed);
   }
+
+}
+
+CallInst * OpenMPIRBuilder::globalizeAlloca(
+  AllocaInst *Alloca,
+  SmallVector<Instruction*, 32> &ToBeDeleted
+) {
+  FunctionCallee AllocFn = getOrCreateRuntimeFunctionPtr(
+    OMPRTL___kmpc_alloc_shared
+  );
+
+  Builder.SetInsertPoint(Alloca);
+  Value *SharedAllocArgs[] = {
+    //ConstantInt::get(Int64, Alloca->getType()->getScalarSizeInBits()/8)
+
+    //ConstantInt::get(Int64, Alloca->getAllocationSize(M.getDataLayout()));
+    //ConstantExpr::getSizeOf(Alloca->getAllocatedType())
+    ConstantInt::get(Int64, Alloca->getAllocationSize(M.getDataLayout())->getFixedValue())
+  };
+
+  CallInst *AllocSharedCall = Builder.CreateCall(AllocFn, ArrayRef<Value*>(SharedAllocArgs, 1));
+  AllocSharedCall->setName(Alloca->getName() + "_on_stack");
+  //Value *ReplValue = Builder.CreateBitcast(AllocSharedCall, Alloca->getType(), Alloca->getName() + "_on_stack");
+
+  dbgs() << "Created " << *AllocSharedCall << "\n";
+  dbgs() << *(Alloca->getType()) << "\n";
+  dbgs() << *(AllocSharedCall->getType()) << "\n";
+
+  //Type *CastType = PointerType::get(Alloca->getAllocatedType(), 0);
+  //dbgs() << " " << *CastType << "\n";
+  //llvm::Value *CastedSharedAlloc = Builder.CreateBitCast(
+  //  AllocSharedCall, CastType, Alloca->getName()+"_on_stack"
+  //);
+
+  //dbgs() << " Casted " << *CastedSharedAlloc << "\n";
+
+  //Alloca->replaceAllUsesWith(AllocSharedCall);
+
+  // If the Alloca was allocated in address space 5 (local) we need to
+  // account for a type mismatch between it and the return from __kmpc_shared_alloc
+
+  for(auto U = Alloca->user_begin(); U != Alloca->user_end(); U++) {
+    dbgs () << " User - " << *(*U) << "\n";
+  }
+
+  if(Alloca->hasOneUser() && isa<AddrSpaceCastInst>(Alloca->user_back())) {
+    auto AddrSpaceCast = dyn_cast<AddrSpaceCastInst>(Alloca->user_back());
+    dbgs() << *(AddrSpaceCast->getType()) << "\n";
+    AddrSpaceCast->replaceAllUsesWith(AllocSharedCall);
+    //AddrSpaceCast->removeFromParent();
+    ToBeDeleted.push_back(AddrSpaceCast);
+  } else {
+    Alloca->replaceAllUsesWith(AllocSharedCall);
+  }
+  ToBeDeleted.push_back(Alloca);
+  //Alloca->removeFromParent();
+
+  //for(auto U = AllocSharedCall->user_begin(); U != AllocSharedCall->user_end(); U++) {
+  //  if(auto AddrSpaceCast = dyn_cast<AddrSpaceCastInst>(*U)) {
+  //    if(AddrSpaceCast->getSrcAddressSpace() == AddrSpaceCast->getDestAddressSpace()) {
+  //      AddrSpaceCast->replaceAllUsesWith(CastedSharedAlloc);
+  //      AddrSpaceCast->removeFromParent();
+  //    }
+  //  }
+  //}
+
+  //Alloca->removeFromParent();
+
+  dbgs() << "  var globalized!\n";
+
+  return AllocSharedCall;
+
+}
+
+void OpenMPIRBuilder::globalizeParallelVars(
+  llvm::Function *CurFn
+) {
+  SmallVector<Instruction*, 32> ToBeDeleted;
+  std::stack<CallInst*> GlobalizedVars;
+
+  dbgs() << "  Exploring: " << CurFn->getName() << "\n";
+  for(auto BB = CurFn->begin(); BB != CurFn->end(); BB++)
+  {
+    for(auto I = BB->begin(); I != BB->end(); I++)
+    {
+      if(auto Alloca = dyn_cast<AllocaInst>(I)) {
+        dbgs() << "    Found Alloca: " << *Alloca << "\n";
+        CallInst * GlobalizedAlloca = globalizeAlloca(Alloca, ToBeDeleted);
+        GlobalizedVars.push(GlobalizedAlloca);
+      } else if(auto FnCall = dyn_cast<CallInst>(I)) {
+        dbgs() << "    Found Function Call: " << *FnCall << "\n";
+      }
+    }
+  }
+
+  BasicBlock &EndBlock = CurFn->back();
+  Builder.SetInsertPoint(EndBlock.begin());
+  while(!GlobalizedVars.empty()) {
+    CallInst *SharedAlloc = GlobalizedVars.top();
+    GlobalizedVars.pop();
+    FunctionCallee FreeFn = getOrCreateRuntimeFunctionPtr(
+      OMPRTL___kmpc_free_shared
+    );
+
+    Value *SharedFreeArgs[] = {
+      SharedAlloc,
+      SharedAlloc->getArgOperand(0)
+    };
+
+    CallInst *SharedFreeCall = Builder.CreateCall(FreeFn, ArrayRef<Value*>(SharedFreeArgs, 2));
+    dbgs() << " Freed - " << *SharedFreeCall << "\n";
+  }
+
+  for(auto I : ToBeDeleted)
+    I->removeFromParent();
+
+}
+
+// Globalize any variables that are needed in a lower level of
+// the parallel hierarchy.
+// Only Vars used in 'simd' regions are supported right now.
+void OpenMPIRBuilder::globalizeVars(llvm::Function *CurFn)
+{
+
+  std::stack<llvm::AllocaInst> Allocas;
+  SmallPtrSet<AllocaInst*, 32> EscapedVars;
+
+  //dbgs() << "Function: " << CurFn->getName() << "\n";
+
+  for(auto BB = CurFn->begin(); BB != CurFn->end(); BB++)
+  {
+    for(auto I = BB->begin(); I != BB->end(); I++)
+    {
+      //dbgs() << "  Instruction: " << *I << "\n";
+      if(auto FnCall = dyn_cast<CallInst>(I))
+      {
+        //dbgs() << "    Found call: " << *FnCall << "\n";
+        if(auto Fn = FnCall->getCalledFunction()) {
+          //dbgs() << "      " << Fn->getName() << "\n";
+          if(Fn->getName() == "__kmpc_parallel_51") {
+            //dbgs() << "        Parallel!\n";
+            
+            Function *OutlinedFn = dyn_cast<Function>(FnCall->getArgOperand(5));
+            assert(OutlinedFn && "failed to find GPU parallel outlined fn");
+
+
+            dbgs() << "Found a parallel region\n";
+            globalizeParallelVars(OutlinedFn);
+          }
+        }
+      }
+    }
+  }
 }
 
 OpenMPIRBuilder::~OpenMPIRBuilder() {
@@ -975,9 +1132,11 @@ OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
-  Value *Args[] = {
-      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
-      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
+  if (!ThreadID)
+    ThreadID = getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
+
+  Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
+                   ThreadID};
 
   // If we are in a cancellable parallel region, barriers are cancellation
   // points.
@@ -1355,6 +1514,467 @@ hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
   }
 }
 
+IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
+  const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
+  LoopBodyCallbackTy BodyGenCB,
+  TripCountCallbackTy DistanceCB,
+  PrivatizeCallbackTy PrivCB,
+  FinalizeCallbackTy FiniCB
+)
+{
+  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
+
+  if (!updateToLocation(Loc))
+    return Loc.IP;
+
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+
+  BasicBlock *InsertBB = Builder.GetInsertBlock();
+  Function *OuterFn = InsertBB->getParent();
+
+  LLVM_DEBUG(dbgs() << "At the start of createSimdLoop:\n" << *OuterFn << "\n");
+
+  // Save the outer alloca block because the insertion iterator may get
+  // invalidated and we still need this later.
+  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
+
+  // Vector to remember instructions we used only during the modeling but which
+  // we want to delete at the end.
+  SmallVector<Instruction *, 16> ToBeDeleted;
+
+  // Create an artificial insertion point that will also ensure the blocks we
+  // are about to split are not degenerated.
+  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
+
+  Instruction *ThenTI = UI, *ElseTI = nullptr;
+
+  BasicBlock *ThenBB = ThenTI->getParent();
+
+  // Alloca block for simd
+  BasicBlock *EntryBB = ThenBB->splitBasicBlock(ThenTI, "omp.simd.entry");
+
+  // Block for setup related to simd
+  // i.e. variable privatization, trip count, reductions
+  BasicBlock *PrologBB = EntryBB->splitBasicBlock(ThenTI, "omp.simd.prolog");
+
+  // Entry block for the outlined loop body
+  // Allocas from the loop body should be done here
+  BasicBlock *LoopEntryBB = PrologBB->splitBasicBlock(ThenTI, "omp.simd.loop.entry");
+
+  // Block for generating the loop body
+  BasicBlock *LoopBodyBB = LoopEntryBB->splitBasicBlock(ThenTI, "omp.simd.loop.body");
+
+  BasicBlock *LoopPreFiniBB =
+    LoopBodyBB->splitBasicBlock(ThenTI, "omp.simd.loop.pre_finalize");
+
+  BasicBlock *LoopExitBB =
+    LoopPreFiniBB->splitBasicBlock(ThenTI, "omp.simd.loop.outlined.exit");
+
+  // Block for finalizing any reductions
+  BasicBlock *ReductionEpilogBB =
+    LoopExitBB->splitBasicBlock(ThenTI, "omp.reduction.epilog");
+
+  BasicBlock *FinalizeBB =
+    ReductionEpilogBB->splitBasicBlock(ThenTI, "omp.simd.finalize");
+
+  auto FiniCBWrapper = [&](InsertPointTy IP) {
+    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
+    // target to the region exit blocks
+    if (IP.getBlock()->end() == IP.getPoint()) {
+      IRBuilder<>::InsertPointGuard IPG(Builder);
+      Builder.restoreIP(IP);
+      Instruction *I = Builder.CreateBr(FinalizeBB); //PRegExitBB);
+      IP = InsertPointTy(I->getParent(), I->getIterator());
+    }
+    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
+           IP.getBlock()->getTerminator()->getSuccessor(0) == FinalizeBB && //PRegExitBB &&
+           "Unexpected insertion point for finalization call!");
+    return FiniCB(IP);
+  };
+
+  FinalizationStack.push_back({FiniCBWrapper, OMPD_simd, false});
+
+  // Compute the loop trip count
+  // Insert after the outer alloca to ensure all variables needed
+  // in its calculation are ready
+  
+  InsertPointTy DistanceIP(PrologBB, PrologBB->getTerminator()->getIterator());
+  assert(DistanceCB && "expected loop trip count callback function!");
+  Value *DistVal = DistanceCB(EntryBB, DistanceIP);
+  assert(DistVal && "trip count call back should return integer trip count");
+  Type *DistValType = DistVal->getType();
+  assert(DistValType->isIntegerTy() && "trip count should be integer type");
+
+  LLVM_DEBUG(dbgs() << "After DistanceCB:\n" << *PrologBB << "\n");
+  LLVM_DEBUG(dbgs() << "Trip count variable: " << *DistVal << "\n");
+
+  // Create the virtual iteration variable that will be pulled into
+  // the outlined function.
+  //Builder.restoreIP(OuterAllocaIP);
+  Builder.SetInsertPoint(EntryBB, EntryBB->begin());
+  AllocaInst *OMPIVAlloca = Builder.CreateAlloca(DistValType, nullptr, "omp.iv.tmp");
+  Instruction *OMPIV = Builder.CreateLoad(DistValType, OMPIVAlloca, "omp.iv");
+  //InsertPointTy MidAllocaIP = Builder.saveIP();
+
+  // Generate the privatization allocas in the block that will become the entry
+  // of the outlined function.
+//  Builder.SetInsertPoint(LoopEntryBB->getTerminator());
+  Builder.SetInsertPoint(LoopEntryBB, LoopEntryBB->begin());
+  // Use omp.iv in the outlined region so it gets captured during outlining
+  Instruction *OMPIVUse = dyn_cast<Instruction>(
+    Builder.CreateAdd(OMPIV, OMPIV, "omp.iv.tobedeleted"));
+  InsertPointTy InnerAllocaIP = Builder.saveIP();
+
+  // All of the temporary omp.iv variables need to be deleted later
+  // Order matters
+  ToBeDeleted.push_back(OMPIVUse);
+  ToBeDeleted.push_back(OMPIV);
+  ToBeDeleted.push_back(OMPIVAlloca);
+
+  LLVM_DEBUG(dbgs() << "omp.iv variable generated:\n" << *OuterFn << "\n");
+
+  LLVM_DEBUG(dbgs() << "Before body codegen:\n" << *OuterFn << "\n");
+  assert(BodyGenCB && "Expected body generation callback!");
+  InsertPointTy CodeGenIP(LoopBodyBB, LoopBodyBB->getTerminator()->getIterator()); //LoopBodyBB->begin());
+
+  InsertPointTy PrologIP(PrologBB, PrologBB->getTerminator()->getIterator());
+  InsertPointTy ReductionEpilogIP(ReductionEpilogBB, ReductionEpilogBB->begin());
+
+  // Generate the body of the loop. The omp.iv variable is a value between 
+  // 0 <= omp.iv < TripCount
+  // If a loop variable is needed, then this callback function can initialize
+  // it based on the omp.iv.
+  BodyGenCB(EntryBB, InnerAllocaIP, CodeGenIP, PrologIP, ReductionEpilogIP, OMPIV);
+
+  LLVM_DEBUG(dbgs() << "After body codegen:\n" << *OuterFn << "\n");
+
+  // Determine what runtime function should be called based on the type
+  // of the trip count
+  //FunctionCallee RTLFn; 
+
+  // Outline 1
+  {
+    OutlineInfo OI;
+
+    // Adjust the finalization stack, verify the adjustment, and call the
+    // finalize function a last time to finalize values between the pre-fini
+    // block and the exit block if we left the parallel "the normal way".
+    //auto FiniInfo = FinalizationStack.pop_back_val();
+    //(void)FiniInfo;
+    //assert(FiniInfo.DK == OMPD_simd && 
+    //       "Unexpected finalization stack state!");
+
+    Instruction *LoopPreFiniTI = LoopPreFiniBB->getTerminator();
+
+    InsertPointTy PreFiniIP(LoopPreFiniBB, LoopPreFiniTI->getIterator());
+    FiniCB(PreFiniIP);
+
+    OI.OuterAllocaBB = EntryBB; //OuterAllocaBlock;
+    OI.EntryBB = LoopEntryBB;
+    OI.ExitBB = LoopExitBB;
+
+    SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
+    SmallVector<BasicBlock *, 32> Blocks;
+    OI.collectBlocks(ParallelRegionBlockSet, Blocks);
+
+    // Ensure a single exit node for the outlined region by creating one.
+    // We might have multiple incoming edges to the exit now due to finalizations,
+    // e.g., cancel calls that cause the control flow to leave the region.
+    //BasicBlock *PRegOutlinedExitBB = PRegExitBB;
+    //PRegExitBB = LRegExitBB;
+    //PRegOutlinedExitBB->setName("omp.loop.outlined.exit");
+
+    Blocks.push_back(LoopExitBB);
+
+    CodeExtractorAnalysisCache CEAC(*OuterFn);
+
+    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
+                            /* AggregateArgs */ true,
+                            /* BlockFrequencyInfo */ nullptr,
+                            /* BranchProbabilityInfo */ nullptr,
+                            /* AssumptionCache */ nullptr,
+                            /* AllowVarArgs */ false,
+                            /* AllowAlloca */ true,
+                            /* AllocationBlock */ EntryBB, //OuterAllocaBlock,
+                            /* Suffix */ ".omp_simd");
+
+    BasicBlock *CommonExit = nullptr;
+    SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
+    Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
+    Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
+
+    auto PrivHelper = [&](Value &V) {
+      // Exclude omp.iv from aggregate
+      if (&V == OMPIV) {
+        OI.ExcludeArgsFromAggregate.push_back(&V);
+        return;
+      }
+
+      // Get all uses of value that are inside of the outlined region
+      SetVector<Use *> Uses;
+      for (Use &U : V.uses())
+        if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
+          if (ParallelRegionBlockSet.count(UserI->getParent()))
+            Uses.insert(&U);
+
+      Value *Inner = &V;
+
+      // If the value isn't a pointer type, store it in a pointer
+      // Unpack it inside the outlined region
+      if (!V.getType()->isPointerTy()) {
+        IRBuilder<>::InsertPointGuard Guard(Builder);
+        LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
+
+        Builder.restoreIP(OuterAllocaIP);
+        Value *Ptr = Builder.CreateAlloca(
+          V.getType(), nullptr, V.getName() + ".reloaded");
+
+        // Store to stack at end of the block that currently branches to the entry
+        // block of the to-be-outlined region.
+        Builder.SetInsertPoint(
+          InsertBB, InsertBB->getTerminator()->getIterator());
+        Builder.CreateStore(&V, Ptr);
+
+        // Load back next to allocations in the to-be-outlined region.
+        Builder.restoreIP(InnerAllocaIP);
+        Inner = Builder.CreateLoad(V.getType(), Ptr);
+      }
+
+      Value *ReplacementValue = nullptr;
+      Builder.restoreIP(
+        PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
+      assert(ReplacementValue &&
+        "Expected copy/create callback to set replacement value!");
+      if (ReplacementValue == &V)
+        return;
+
+      for (Use *UPtr : Uses)
+        UPtr->set(ReplacementValue);
+
+    };
+
+    LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
+
+    InnerAllocaIP = IRBuilder<>::InsertPoint(
+        OMPIV->getParent(), OMPIV->getNextNode()->getIterator());
+
+    // Reset the outer alloca insertion point to the entry of the relevant block
+    // in case it was invalidated.
+    OuterAllocaIP = IRBuilder<>::InsertPoint(
+      OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
+
+    for (Value *Input : Inputs) {
+      PrivHelper(*Input);
+    }
+
+    assert(Outputs.empty() &&
+      "OpenMP outlining should not produce live-out values!");
+
+    LLVM_DEBUG(dbgs() << "After  privatization: " << *OuterFn << "\n");
+    for (auto *BB : Blocks) {
+      LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n");
+    }
+
+    int NumInputs = Inputs.size()-1; // One argument is always omp.iv
+    OI.PostOutlineCB = [=](Function &OutlinedFn) {
+
+      OutlinedFn.addFnAttr(Attribute::NoUnwind);
+      OutlinedFn.addFnAttr(Attribute::NoRecurse);
+
+      assert(OutlinedFn.arg_size() == 2 &&
+             "Expected omp.iv & structArg as arguments");
+
+      CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
+      BasicBlock *CallBlock = CI->getParent();
+      CallBlock->setName("omp_loop");
+      Builder.SetInsertPoint(CI);
+
+      Value * StructArg = CI->getArgOperand(1); // 0 should be omp.iv
+
+      Value *SimdArgs[] = {
+          Ident,
+          Builder.CreateBitCast(&OutlinedFn, LoopTaskPtr),
+          DistVal,
+          Builder.CreateCast(Instruction::BitCast, StructArg, Int8PtrPtr)};
+
+      SmallVector<Value *, 16> RealArgs;
+      RealArgs.append(std::begin(SimdArgs), std::end(SimdArgs));
+
+      FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr(
+        (DistValType->isIntegerTy(32) ? OMPRTL___kmpc_simd_4u :
+                                        OMPRTL___kmpc_simd_8u));
+      Builder.CreateCall(RTLFn, RealArgs);
+
+      LLVM_DEBUG(dbgs() << "With kmpc_simd_4u call placed: " << *Builder.GetInsertBlock()->getParent() << "\n");
+
+      CI->eraseFromParent();
+
+      for (Instruction *I : ToBeDeleted)
+        I->eraseFromParent();
+
+    };
+
+    addOutlineInfo(std::move(OI));
+  }
+
+
+// Outline 2
+  if(false) { // if(!SPMD) {
+    OutlineInfo OI;
+
+    OI.OuterAllocaBB = OuterAllocaBlock;
+    OI.EntryBB = EntryBB; //LoopEntryBB;
+    OI.ExitBB = FinalizeBB; //LoopExitBB;
+
+    SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
+    SmallVector<BasicBlock *, 32> Blocks;
+    OI.collectBlocks(ParallelRegionBlockSet, Blocks);
+
+    CodeExtractorAnalysisCache CEAC(*OuterFn);
+
+    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
+                            /* AggregateArgs */ true,
+                            /* BlockFrequencyInfo */ nullptr,
+                            /* BranchProbabilityInfo */ nullptr,
+                            /* AssumptionCache */ nullptr,
+                            /* AllowVarArgs */ false,
+                            /* AllowAlloca */ true,
+                            /* AllocationBlock */ OuterAllocaBlock,
+                            /* Suffix */ ".omp_simd");
+
+    BasicBlock *CommonExit = nullptr;
+    SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
+    Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
+    Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
+
+    auto PrivHelper = [&](Value &V) {
+      // Exclude omp.iv from aggregate
+      //if (&V == OMPIV) {
+      //  OI.ExcludeArgsFromAggregate.push_back(&V);
+      //  return;
+      //}
+
+      // Get all uses of value that are inside of the outlined region
+      SetVector<Use *> Uses;
+      for (Use &U : V.uses())
+        if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
+          if (ParallelRegionBlockSet.count(UserI->getParent()))
+            Uses.insert(&U);
+
+      Value *Inner = &V;
+
+      // If the value isn't a pointer type, store it in a pointer
+      // Unpack it inside the outlined region
+      if (!V.getType()->isPointerTy()) {
+        IRBuilder<>::InsertPointGuard Guard(Builder);
+        LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
+
+        Builder.restoreIP(OuterAllocaIP);
+        Value *Ptr = Builder.CreateAlloca(
+          V.getType(), nullptr, V.getName() + ".reloaded");
+
+        // Store to stack at end of the block that currently branches to the entry
+        // block of the to-be-outlined region.
+        Builder.SetInsertPoint(
+          InsertBB, InsertBB->getTerminator()->getIterator());
+        Builder.CreateStore(&V, Ptr);
+
+        // Load back next to allocations in the to-be-outlined region.
+        Builder.restoreIP(InnerAllocaIP);
+        Inner = Builder.CreateLoad(V.getType(), Ptr);
+      }
+
+      Value *ReplacementValue = nullptr;
+      Builder.restoreIP(
+        PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
+      assert(ReplacementValue &&
+        "Expected copy/create callback to set replacement value!");
+      if (ReplacementValue == &V)
+        return;
+
+      for (Use *UPtr : Uses)
+        UPtr->set(ReplacementValue);
+
+    };
+
+    LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
+
+    InnerAllocaIP = IRBuilder<>::InsertPoint(
+        OMPIV->getParent(), OMPIV->getNextNode()->getIterator());
+
+    // Reset the outer alloca insertion point to the entry of the relevant block
+    // in case it was invalidated.
+    OuterAllocaIP = IRBuilder<>::InsertPoint(
+      OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
+
+    for (Value *Input : Inputs) {
+      PrivHelper(*Input);
+    }
+
+    assert(Outputs.empty() &&
+      "OpenMP outlining should not produce live-out values!");
+
+    LLVM_DEBUG(dbgs() << "After  privatization: " << *OuterFn << "\n");
+    for (auto *BB : Blocks) {
+      LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n");
+    }
+
+    int NumInputs = Inputs.size();
+
+    OI.PostOutlineCB = [=](Function &OutlinedFn) {
+
+      OutlinedFn.addFnAttr(Attribute::NoUnwind);
+      OutlinedFn.addFnAttr(Attribute::NoRecurse);
+
+      assert(OutlinedFn.arg_size() == 1 &&
+             "Expected structArg as arguments");
+
+      CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
+      BasicBlock *CallBlock = CI->getParent();
+      CallBlock->setName("omp_simd");
+      Builder.SetInsertPoint(CI);
+
+      Value * StructArg = CI->getArgOperand(0);
+
+      Value *SimdArgs[] = {
+          Ident,
+          Builder.CreateBitCast(&OutlinedFn, SimdTaskPtr),
+          Builder.CreateCast(Instruction::BitCast, StructArg, Int8PtrPtr),
+          Builder.getInt32(NumInputs)};
+
+      SmallVector<Value *, 16> RealArgs;
+      RealArgs.append(std::begin(SimdArgs), std::end(SimdArgs));
+
+      FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr(
+        OMPRTL___kmpc_simd);
+      Builder.CreateCall(RTLFn, RealArgs);
+
+      LLVM_DEBUG(dbgs() << "With __kmpc_simd call placed: " << *Builder.GetInsertBlock()->getParent() << "\n");
+
+      CI->eraseFromParent();
+
+      for (Instruction *I : ToBeDeleted)
+        I->eraseFromParent();
+
+    };
+
+    addOutlineInfo(std::move(OI));
+  }
+
+
+
+
+
+  InsertPointTy AfterIP(FinalizeBB, FinalizeBB->end()); //UI->getParent(), UI->getParent()->end());
+  UI->eraseFromParent();
+
+  return AfterIP;
+
+}
+
+
 IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
     const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
     BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
@@ -1652,7 +2272,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
   LLVM_DEBUG(dbgs() << "After  privatization: " << *OuterFn << "\n");
   LLVM_DEBUG({
     for (auto *BB : Blocks)
-      dbgs() << " PBR: " << BB->getName() << "\n";
+      LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n");
   });
 
   // Adjust the finalization stack, verify the adjustment, and call the
@@ -3495,7 +4115,18 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
   }
   Value *ReductionDataSize =
       Builder.getInt64(MaxDataSize * ReductionInfos.size());
-  if (!IsTeamsReduction) {
+  if(IsSimdReduction) {
+    Value *SarFuncCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
+    Value *WcFuncCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
+    Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
+    //Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
+    //    RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
+    Function *SimdReduceFn = getOrCreateRuntimeFunctionPtr(
+        RuntimeFunction::OMPRTL___kmpc_nvptx_simd_reduce_nowait_v2);
+    Res = Builder.CreateCall(SimdReduceFn, Args);
+  } else if (!IsTeamsReduction) {
     Value *SarFuncCast =
         Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
     Value *WcFuncCast =
@@ -3616,6 +4247,9 @@ OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
   if (!updateToLocation(Loc))
     return InsertPointTy();
 
+  if (ReductionInfos.size() == 0)
+    return Builder.saveIP();
+
   BasicBlock *InsertBlock = Loc.IP.getBlock();
   BasicBlock *ContinuationBlock =
       InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
@@ -3656,7 +4290,7 @@ OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
   const DataLayout &DL = Module->getDataLayout();
   unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
   Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
-  Function *ReductionFunc = getFreshReductionFunc(*Module);
+  Function *ReductionFunc = getFreshReductionFunc(M);
   Value *Lock = getOMPCriticalRegionLock(".reduction");
   Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
       IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
@@ -8585,6 +9219,7 @@ void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
       [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
           const TargetRegionEntryInfo &EntryInfo,
           const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
+
         // Generate metadata for target regions. Each entry of this metadata
         // contains:
         // - Entry 0 -> Kind of this type of metadata (0).
@@ -8922,7 +9557,6 @@ void OpenMPIRBuilder::registerTargetGlobalVariable(
     VarSize = M.getDataLayout().getPointerSize();
     Linkage = GlobalValue::WeakAnyLinkage;
   }
-
   OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
                                                       Flags, Linkage);
 }
@@ -9010,6 +9644,7 @@ bool OffloadEntriesInfoManager::empty() const {
 
 unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
     const TargetRegionEntryInfo &EntryInfo) const {
+
   auto It = OffloadEntriesTargetRegionCount.find(
       getTargetRegionEntryCountKey(EntryInfo));
   if (It == OffloadEntriesTargetRegionCount.end())
@@ -9019,6 +9654,7 @@ unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
 
 void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
     const TargetRegionEntryInfo &EntryInfo) {
+
   OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
       EntryInfo.Count + 1;
 }
@@ -9026,6 +9662,7 @@ void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
 /// Initialize target region entry.
 void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
     const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
+
   OffloadEntriesTargetRegion[EntryInfo] =
       OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
                                    OMPTargetRegionEntryTargetRegion);
@@ -9035,6 +9672,7 @@ void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
 void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
     TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
     OMPTargetRegionEntryKind Flags) {
+
   assert(EntryInfo.Count == 0 && "expected default EntryInfo");
 
   // Update the EntryInfo with the next available count for this location.
@@ -9082,6 +9720,7 @@ bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
 
 void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
     const OffloadTargetRegionEntryInfoActTy &Action) {
+
   // Scan all target region entries and perform the provided action.
   for (const auto &It : OffloadEntriesTargetRegion) {
     Action(It.first, It.second);
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 15b26a38cc28ef..0ad2b3a055f6c4 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1194,7 +1194,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
   }
 
   StructType *StructArgTy = nullptr;
-  AllocaInst *Struct = nullptr;
+  //AllocaInst *Struct = nullptr;
+  Instruction *Struct = nullptr;
   unsigned NumAggregatedInputs = 0;
   if (AggregateArgs && !StructValues.empty()) {
     std::vector<Type *> ArgTypes;
@@ -1210,12 +1211,16 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
 
     if (ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) {
       auto *StructSpaceCast = new AddrSpaceCastInst(
-          Struct, PointerType ::get(Context, 0), "structArg.ascast");
+        Struct, PointerType ::get(Context, 0), "structArg.ascast");
       StructSpaceCast->insertAfter(Struct);
-      params.push_back(StructSpaceCast);
+      // There isn't really a point in generating this cast if you
+      // just aren't going to use it...
+      Struct = StructSpaceCast;
+      //params.push_back(StructSpaceCast);
     } else {
-      params.push_back(Struct);
+      //params.push_back(Struct);
     }
+    params.push_back(Struct);
     // Store aggregated inputs in the struct.
     for (unsigned i = 0, e = StructValues.size(); i != e; ++i) {
       if (inputs.contains(StructValues[i])) {
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index c4bfaaa2404b4f..b970d078fc0ddb 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -167,6 +167,9 @@ double omp_get_wtick(void);
 
 double omp_get_wtime(void);
 ///}
+
+int omp_get_simd_lane(void);
+
 }
 
 extern "C" {
@@ -233,6 +236,12 @@ void __kmpc_target_deinit();
 ///{
 void *__kmpc_reduction_get_fixed_buffer();
 
+int32_t __kmpc_nvptx_simd_reduce_nowait_v2(IdentTy *Loc,
+                                           uint64_t reduce_data_size,
+                                           void *reduce_data,
+                                           ShuffleReductFnTy shflFct,
+                                           InterWarpCopyFnTy cpyFct);
+
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                                uint64_t reduce_data_size,
                                                void *reduce_data,
@@ -257,6 +266,8 @@ int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId);
 
 void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId);
 
+void __kmpc_simd_barrier(void);
+
 void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId);
 
 void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId);
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index 2fb87abe5418c0..d81ccff436cc2c 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -105,6 +105,13 @@ uint32_t getMaxTeamThreads(bool IsSPMD);
 /// Return the number of processing elements on the device.
 uint32_t getNumberOfProcessorElements();
 
+uint32_t getSimdLen();
+uint32_t getSimdGroup();
+uint32_t getSimdLane();
+bool isSimdLeader();
+uint32_t getNumSimdGroups();
+LaneMaskTy simdmask();
+
 } // namespace mapping
 
 } // namespace ompx
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 8bb275eae776c6..e6c01d1741821a 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -42,9 +42,9 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
 static void genericStateMachine(IdentTy *Ident) {
   uint32_t TId = mapping::getThreadIdInBlock();
 
+
   do {
     ParallelRegionFnTy WorkFn = nullptr;
-
     // Wait for the signal that we have a new work function.
     synchronize::threads(atomic::seq_cst);
 
@@ -100,7 +100,9 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
   }
 
   if (mapping::isInitialThreadInLevel0(IsSPMD))
+  {
     return -1;
+  }
 
   // Enter the generic state machine if enabled and if this thread can possibly
   // be an active worker thread.
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 3aefcff68e1956..41f47ad7a78539 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -322,6 +322,40 @@ uint32_t mapping::getNumberOfProcessorElements() {
   return static_cast<uint32_t>(config::getHardwareParallelism());
 }
 
+uint32_t mapping::getSimdLen() {
+  return 1;
+}
+
+uint32_t mapping::getSimdGroup() {
+  uint32_t SimdGroup = mapping::getThreadIdInBlock() / mapping::getSimdLen();
+  return SimdGroup;
+}
+
+uint32_t mapping::getSimdLane() {
+  uint32_t SimdId = mapping::getThreadIdInWarp() % mapping::getSimdLen();
+  return SimdId;
+}
+
+bool mapping::isSimdLeader() {
+  return !mapping::getSimdLane();
+}
+
+uint32_t mapping::getNumSimdGroups() {
+  //uint32_t NumGroups = mapping::getBlockSize() / mapping::getSimdLen();
+  uint32_t NumGroups = state::getEffectivePTeamSize() / mapping::getSimdLen();
+  return NumGroups;
+}
+
+LaneMaskTy mapping::simdmask() {
+  uint32_t GroupSize = mapping::getSimdLen();
+  uint32_t Group = mapping::getSimdGroup();
+  uint32_t WarpSize = mapping::getWarpSize();
+  LaneMaskTy Mask = ~(LaneMaskTy)0;
+  Mask = Mask >> (sizeof(LaneMaskTy)*8 - GroupSize);
+  Mask = Mask << (Group * GroupSize) % WarpSize;
+  return Mask;
+}
+
 ///}
 
 /// Execution mode
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index 5286d53b623f0a..fe6eb3590d92e8 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -87,8 +87,10 @@ extern "C" {
                                                    int32_t num_threads,
                                                    void *fn, void **args,
                                                    const int64_t nargs) {
+  //printf("SPMD mode\n");
   uint32_t TId = mapping::getThreadIdInBlock();
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
+  NumThreads = NumThreads / mapping::getSimdLen();
   uint32_t PTeamSize =
       NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
   // Avoid the race between the read of the `icv::Level` above and the write
@@ -101,6 +103,9 @@ extern "C" {
     state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                           1u, TId == 0, ident,
                                           /*ForceTeamState=*/true);
+    //state::ValueRAII SimdLengthRAII(state::SimdLength, StaticSimdLen,
+    //                                 1u, TId == 0, ident,
+    //                                 /*ForceTeamState=*/true);
     state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                      /*ForceTeamState=*/true);
     state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
@@ -119,7 +124,7 @@ extern "C" {
     // assumptions above.
     synchronize::threadsAligned(atomic::relaxed);
 
-    if (!PTeamSize || TId < PTeamSize)
+    if (!PTeamSize || (TId < PTeamSize*mapping::getSimdLen()))
       invokeMicrotask(TId, 0, fn, args, nargs);
 
     // Synchronize all threads at the end of a parallel region.
@@ -141,6 +146,8 @@ extern "C" {
   return;
 }
 
+
+
 [[clang::always_inline]] void
 __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                    int32_t num_threads, int proc_bind, void *fn,
@@ -166,6 +173,14 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   // From this point forward we know that there is no thread state used.
   ASSERT(state::HasThreadState == false, nullptr);
 
+  //printf("num_threads=%i\n", num_threads);
+  uint32_t NumThreads = determineNumberOfThreads(num_threads);
+  //printf("NumThreads=%i\n", NumThreads);
+  NumThreads = NumThreads / mapping::getSimdLen();
+  //printf("New NumThreads=%i\n", NumThreads);
+  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
+  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
+  //printf("PTeamSize=%i\n", PTeamSize);
   if (mapping::isSPMDMode()) {
     // This was moved to its own routine so it could be called directly
     // in certain situations to avoid resource consumption of unused
@@ -185,7 +200,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   // set, but they do not have individual ThreadStates yet. If they ever
   // modify the ICVs beyond this point a ThreadStates will be allocated.
 
-  bool IsActiveParallelRegion = NumThreads > 1;
+  bool IsActiveParallelRegion = NumThreads*mapping::getSimdLen() > 1;
   if (!IsActiveParallelRegion) {
     state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
     invokeMicrotask(TId, 0, fn, args, nargs);
@@ -254,12 +269,16 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   }
 
   {
+    //printf("Generic execution\n");
     // Note that the order here is important. `icv::Level` has to be updated
     // last or the other updates will cause a thread specific state to be
     // created.
     state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                           1u, true, ident,
                                           /*ForceTeamState=*/true);
+    //state::ValueRAII SimdLengthRAII(state::SimdLength, StaticSimdLen,
+    //                                 1u, TId == 0, ident,
+    //                                 /*ForceTeamState=*/true);
     state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                           (void *)nullptr, true, ident,
                                           /*ForceTeamState=*/true);
@@ -288,7 +307,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
 
   // Set to true for workers participating in the parallel region.
   uint32_t TId = mapping::getThreadIdInBlock();
-  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
+  bool ThreadIsActive = TId < state::getEffectivePTeamSize()*mapping::getSimdLen();
   return ThreadIsActive;
 }
 
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 57df159d3f28e5..402c33c7779eb4 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -164,9 +164,57 @@ uint32_t roundToWarpsize(uint32_t s) {
 
 uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
 
+static int32_t nvptx_simd_reduce_nowait(void *reduce_data,
+                                            ShuffleReductFnTy shflFct,
+                                            InterWarpCopyFnTy cpyFct) {
+  uint32_t SimdId = mapping::getSimdLane();
+  uint32_t NumThreads = mapping::getSimdLen();
+  if(NumThreads == 1)
+    return 1;
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  if (NumThreads == mapping::getWarpSize())
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else 
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/NumThreads,
+                              /*LaneId=*/mapping::getSimdLane());
+#else
+  __kmpc_impl_lanemask_t Liveness = mapping::simdmask();
+  if (Liveness == lanes::All) // Full warp
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else 
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/utils::popc(Liveness),
+                              /*LaneId=*/mapping::getSimdLane());
+#endif
+
+  return mapping::isSimdLeader();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
 } // namespace
 
 extern "C" {
+int32_t __kmpc_nvptx_simd_reduce_nowait_v2(IdentTy *Loc,
+                                           uint64_t reduce_data_size,
+                                           void *reduce_data,
+                                           ShuffleReductFnTy shflFct,
+                                           InterWarpCopyFnTy cpyFct) {
+  return nvptx_simd_reduce_nowait(reduce_data, shflFct, cpyFct);
+}
+
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                                uint64_t reduce_data_size,
                                                void *reduce_data,
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 855c74fa58e0a5..8c96d4cfd6d011 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -364,13 +364,18 @@ void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
 }
 
 int omp_get_ancestor_thread_num(int Level) {
-  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
+  //return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
+  return returnValIfLevelIsActive(Level, mapping::getSimdGroup(), 0);
 }
 
 int omp_get_thread_num(void) {
   return omp_get_ancestor_thread_num(omp_get_level());
 }
 
+int omp_get_simd_lane(void) {
+  return mapping::getSimdLane();
+}
+
 int omp_get_team_size(int Level) {
   return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
 }
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index 9ea8d171cc830e..4d56b7fc149448 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -533,6 +533,10 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
   impl::namedBarrier();
 }
 
+void __kmpc_simd_barrier(void) {
+  synchronize::warp(mapping::simdmask());
+}
+
 [[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
   synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
 }
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index ad60e66548be90..5086eb4966179c 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -518,6 +518,28 @@ void workshare::init(bool IsSPMD) {
     ThreadDST = nullptr;
 }
 
+template<typename IType>
+void SimdLoop(
+  IdentTy *ident, void *WorkFn, IType TripCount,
+  void **Args
+) {
+  ASSERT(WorkFn, "expected valid outlined function"); 
+  __kmpc_impl_lanemask_t SimdMask = mapping::simdmask();
+  uint32_t Step = mapping::getSimdLen();
+
+  //printf("Thread=%i : Lane=%i : Len=%i : TripCount=%i\n",
+  //       mapping::getThreadIdInBlock(), mapping::getSimdLane(), mapping::getSimdLen(), TripCount);
+
+  synchronize::warp(SimdMask);
+  for(IType omp_iv = (IType) mapping::getSimdLane();
+      omp_iv < TripCount;
+      omp_iv += Step
+  ) {
+    ((void (*)(IType, void**))WorkFn)(omp_iv, Args);
+  }
+  synchronize::warp(SimdMask);
+}
+
 extern "C" {
 
 // init
@@ -683,6 +705,28 @@ void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
 void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
 
 void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
+
+void __kmpc_simd_4u(
+  IdentTy *ident, void *WorkFn, uint32_t TripCount,
+  void **Args
+) {
+  SimdLoop<uint32_t>(ident, WorkFn, TripCount, Args);
+}
+
+void __kmpc_simd_8u(
+  IdentTy *ident, void *WorkFn, uint64_t TripCount,
+  void **Args
+) {
+  SimdLoop<uint64_t>(ident, WorkFn, TripCount, Args);
+}
+
+void __kmpc_simd(
+  IdentTy *ident, void *WorkFn, void **Args, uint32_t nargs
+) {
+  ASSERT(WorkFn, "expected valid outlined function"); 
+  ((void (*)(void**))WorkFn)(Args);
+}
+
 }
 
 namespace ompx {

>From ec9fa48937da7f88aceaeb06e5efaed0d42dabd7 Mon Sep 17 00:00:00 2001
From: Eric Francis Wright <wright117 at rzvernal10.llnl.gov>
Date: Fri, 14 Jun 2024 08:44:22 -0700
Subject: [PATCH 2/5] Changed where certain variables are allocated so that it
 is consistent

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 1d5b24475d1d1b..07557d66cde398 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1602,7 +1602,8 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
   
   InsertPointTy DistanceIP(PrologBB, PrologBB->getTerminator()->getIterator());
   assert(DistanceCB && "expected loop trip count callback function!");
-  Value *DistVal = DistanceCB(EntryBB, DistanceIP);
+  //Value *DistVal = DistanceCB(EntryBB, DistanceIP);
+  Value *DistVal = DistanceCB(OuterAllocaBlock, DistanceIP);
   assert(DistVal && "trip count call back should return integer trip count");
   Type *DistValType = DistVal->getType();
   assert(DistValType->isIntegerTy() && "trip count should be integer type");

>From 5fba287fb12a2b18c55270fa51d5f587b53f0c89 Mon Sep 17 00:00:00 2001
From: Eric Francis Wright <wright117 at rzansel61.coral.llnl.gov>
Date: Thu, 3 Oct 2024 23:40:20 -0700
Subject: [PATCH 3/5] Added SimdLen to the team state and removed unneeded
 globalization code

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 154 ----------------------
 offload/DeviceRTL/include/Mapping.h       |  15 +++
 offload/DeviceRTL/include/State.h         |   7 +
 offload/DeviceRTL/src/Mapping.cpp         |   2 +-
 offload/DeviceRTL/src/Parallelism.cpp     |  25 ++--
 offload/DeviceRTL/src/State.cpp           |   5 +-
 6 files changed, 40 insertions(+), 168 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 07557d66cde398..0798ea353c5d19 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -800,8 +800,6 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
   for (Function *F : ConstantAllocaRaiseCandidates)
     raiseUserConstantDataAllocasToEntryBlock(Builder, F);
 
-  //globalizeVars(Fn);
-
   EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
       [](EmitMetadataErrorKind Kind,
          const TargetRegionEntryInfo &EntryInfo) -> void {
@@ -821,158 +819,6 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
 
 }
 
-CallInst * OpenMPIRBuilder::globalizeAlloca(
-  AllocaInst *Alloca,
-  SmallVector<Instruction*, 32> &ToBeDeleted
-) {
-  FunctionCallee AllocFn = getOrCreateRuntimeFunctionPtr(
-    OMPRTL___kmpc_alloc_shared
-  );
-
-  Builder.SetInsertPoint(Alloca);
-  Value *SharedAllocArgs[] = {
-    //ConstantInt::get(Int64, Alloca->getType()->getScalarSizeInBits()/8)
-
-    //ConstantInt::get(Int64, Alloca->getAllocationSize(M.getDataLayout()));
-    //ConstantExpr::getSizeOf(Alloca->getAllocatedType())
-    ConstantInt::get(Int64, Alloca->getAllocationSize(M.getDataLayout())->getFixedValue())
-  };
-
-  CallInst *AllocSharedCall = Builder.CreateCall(AllocFn, ArrayRef<Value*>(SharedAllocArgs, 1));
-  AllocSharedCall->setName(Alloca->getName() + "_on_stack");
-  //Value *ReplValue = Builder.CreateBitcast(AllocSharedCall, Alloca->getType(), Alloca->getName() + "_on_stack");
-
-  dbgs() << "Created " << *AllocSharedCall << "\n";
-  dbgs() << *(Alloca->getType()) << "\n";
-  dbgs() << *(AllocSharedCall->getType()) << "\n";
-
-  //Type *CastType = PointerType::get(Alloca->getAllocatedType(), 0);
-  //dbgs() << " " << *CastType << "\n";
-  //llvm::Value *CastedSharedAlloc = Builder.CreateBitCast(
-  //  AllocSharedCall, CastType, Alloca->getName()+"_on_stack"
-  //);
-
-  //dbgs() << " Casted " << *CastedSharedAlloc << "\n";
-
-  //Alloca->replaceAllUsesWith(AllocSharedCall);
-
-  // If the Alloca was allocated in address space 5 (local) we need to
-  // account for a type mismatch between it and the return from __kmpc_shared_alloc
-
-  for(auto U = Alloca->user_begin(); U != Alloca->user_end(); U++) {
-    dbgs () << " User - " << *(*U) << "\n";
-  }
-
-  if(Alloca->hasOneUser() && isa<AddrSpaceCastInst>(Alloca->user_back())) {
-    auto AddrSpaceCast = dyn_cast<AddrSpaceCastInst>(Alloca->user_back());
-    dbgs() << *(AddrSpaceCast->getType()) << "\n";
-    AddrSpaceCast->replaceAllUsesWith(AllocSharedCall);
-    //AddrSpaceCast->removeFromParent();
-    ToBeDeleted.push_back(AddrSpaceCast);
-  } else {
-    Alloca->replaceAllUsesWith(AllocSharedCall);
-  }
-  ToBeDeleted.push_back(Alloca);
-  //Alloca->removeFromParent();
-
-  //for(auto U = AllocSharedCall->user_begin(); U != AllocSharedCall->user_end(); U++) {
-  //  if(auto AddrSpaceCast = dyn_cast<AddrSpaceCastInst>(*U)) {
-  //    if(AddrSpaceCast->getSrcAddressSpace() == AddrSpaceCast->getDestAddressSpace()) {
-  //      AddrSpaceCast->replaceAllUsesWith(CastedSharedAlloc);
-  //      AddrSpaceCast->removeFromParent();
-  //    }
-  //  }
-  //}
-
-  //Alloca->removeFromParent();
-
-  dbgs() << "  var globalized!\n";
-
-  return AllocSharedCall;
-
-}
-
-void OpenMPIRBuilder::globalizeParallelVars(
-  llvm::Function *CurFn
-) {
-  SmallVector<Instruction*, 32> ToBeDeleted;
-  std::stack<CallInst*> GlobalizedVars;
-
-  dbgs() << "  Exploring: " << CurFn->getName() << "\n";
-  for(auto BB = CurFn->begin(); BB != CurFn->end(); BB++)
-  {
-    for(auto I = BB->begin(); I != BB->end(); I++)
-    {
-      if(auto Alloca = dyn_cast<AllocaInst>(I)) {
-        dbgs() << "    Found Alloca: " << *Alloca << "\n";
-        CallInst * GlobalizedAlloca = globalizeAlloca(Alloca, ToBeDeleted);
-        GlobalizedVars.push(GlobalizedAlloca);
-      } else if(auto FnCall = dyn_cast<CallInst>(I)) {
-        dbgs() << "    Found Function Call: " << *FnCall << "\n";
-      }
-    }
-  }
-
-  BasicBlock &EndBlock = CurFn->back();
-  Builder.SetInsertPoint(EndBlock.begin());
-  while(!GlobalizedVars.empty()) {
-    CallInst *SharedAlloc = GlobalizedVars.top();
-    GlobalizedVars.pop();
-    FunctionCallee FreeFn = getOrCreateRuntimeFunctionPtr(
-      OMPRTL___kmpc_free_shared
-    );
-
-    Value *SharedFreeArgs[] = {
-      SharedAlloc,
-      SharedAlloc->getArgOperand(0)
-    };
-
-    CallInst *SharedFreeCall = Builder.CreateCall(FreeFn, ArrayRef<Value*>(SharedFreeArgs, 2));
-    dbgs() << " Freed - " << *SharedFreeCall << "\n";
-  }
-
-  for(auto I : ToBeDeleted)
-    I->removeFromParent();
-
-}
-
-// Globalize any variables that are needed in a lower level of
-// the parallel hierarchy.
-// Only Vars used in 'simd' regions are supported right now.
-void OpenMPIRBuilder::globalizeVars(llvm::Function *CurFn)
-{
-
-  std::stack<llvm::AllocaInst> Allocas;
-  SmallPtrSet<AllocaInst*, 32> EscapedVars;
-
-  //dbgs() << "Function: " << CurFn->getName() << "\n";
-
-  for(auto BB = CurFn->begin(); BB != CurFn->end(); BB++)
-  {
-    for(auto I = BB->begin(); I != BB->end(); I++)
-    {
-      //dbgs() << "  Instruction: " << *I << "\n";
-      if(auto FnCall = dyn_cast<CallInst>(I))
-      {
-        //dbgs() << "    Found call: " << *FnCall << "\n";
-        if(auto Fn = FnCall->getCalledFunction()) {
-          //dbgs() << "      " << Fn->getName() << "\n";
-          if(Fn->getName() == "__kmpc_parallel_51") {
-            //dbgs() << "        Parallel!\n";
-            
-            Function *OutlinedFn = dyn_cast<Function>(FnCall->getArgOperand(5));
-            assert(OutlinedFn && "failed to find GPU parallel outlined fn");
-
-
-            dbgs() << "Found a parallel region\n";
-            globalizeParallelVars(OutlinedFn);
-          }
-        }
-      }
-    }
-  }
-}
-
 OpenMPIRBuilder::~OpenMPIRBuilder() {
   assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
 }
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index d81ccff436cc2c..c5b73f8cc71f9b 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -105,11 +105,26 @@ uint32_t getMaxTeamThreads(bool IsSPMD);
 /// Return the number of processing elements on the device.
 uint32_t getNumberOfProcessorElements();
 
+/// Return the number of threads reserved for simd loops per parallel thread.
+/// This is between [1, getWarpSize()].
 uint32_t getSimdLen();
+
+/// Return what simd group the thread belongs to.
 uint32_t getSimdGroup();
+
+/// Return the thread ID within its simd group, in [0, getSimdLen())
 uint32_t getSimdLane();
+
+/// Return true if the thread is simd lane 0, i.e., the thread
+/// that executes parallel regions.
 bool isSimdLeader();
+
+/// Return the number of simd groups in the team. This is
+/// getMaxTeamThreads() / getSimdLen().
 uint32_t getNumSimdGroups();
+
+/// Return the lane mask that corresponds to all threads within
+/// the simd group.
 LaneMaskTy simdmask();
 
 } // namespace mapping
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index 565235cd48a913..24578ae1385aba 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -84,6 +84,7 @@ struct TeamStateTy {
   ///}
 
   uint32_t ParallelTeamSize;
+  uint32_t SimdLength;
   uint32_t HasThreadState;
   ParallelRegionFnTy ParallelRegionFnVar;
 };
@@ -140,6 +141,7 @@ enum ValueKind {
   VK_RunSchedChunk,
   VK_ParallelRegionFn,
   VK_ParallelTeamSize,
+  VK_SimdLength,
   VK_HasThreadState,
 };
 
@@ -217,6 +219,8 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
                                  ForceTeamState);
   case state::VK_ParallelTeamSize:
     return TeamState.ParallelTeamSize;
+  case state::VK_SimdLength:
+    return TeamState.SimdLength;
   case state::VK_HasThreadState:
     return TeamState.HasThreadState;
   default:
@@ -340,6 +344,9 @@ inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk;
 /// TODO
 inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;
 
+/// TODO
+inline state::Value<uint32_t, state::VK_SimdLength> SimdLength;
+
 /// TODO
 inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState;
 
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 41f47ad7a78539..c491708b6225d0 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -323,7 +323,7 @@ uint32_t mapping::getNumberOfProcessorElements() {
 }
 
 uint32_t mapping::getSimdLen() {
-  return 1;
+  return state::SimdLength;
 }
 
 uint32_t mapping::getSimdGroup() {
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index fe6eb3590d92e8..85baf580465f30 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -87,10 +87,12 @@ extern "C" {
                                                    int32_t num_threads,
                                                    void *fn, void **args,
                                                    const int64_t nargs) {
-  //printf("SPMD mode\n");
   uint32_t TId = mapping::getThreadIdInBlock();
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
-  NumThreads = NumThreads / mapping::getSimdLen();
+
+  // Any threads left over from the team maximum vs. what is used in the
+  // parallel region are reserved for simd loops.
+  uint32_t SimdLen = mapping::getMaxTeamThreads() / NumThreads;
   uint32_t PTeamSize =
       NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
   // Avoid the race between the read of the `icv::Level` above and the write
@@ -103,9 +105,9 @@ extern "C" {
     state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                           1u, TId == 0, ident,
                                           /*ForceTeamState=*/true);
-    //state::ValueRAII SimdLengthRAII(state::SimdLength, StaticSimdLen,
-    //                                 1u, TId == 0, ident,
-    //                                 /*ForceTeamState=*/true);
+    state::ValueRAII SimdLengthRAII(state::SimdLength, SimdLen,
+                                     1u, TId == 0, ident,
+                                     /*ForceTeamState=*/true);
     state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                      /*ForceTeamState=*/true);
     state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
@@ -173,14 +175,11 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   // From this point forward we know that there is no thread state used.
   ASSERT(state::HasThreadState == false, nullptr);
 
-  //printf("num_threads=%i\n", num_threads);
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
-  //printf("NumThreads=%i\n", NumThreads);
   NumThreads = NumThreads / mapping::getSimdLen();
-  //printf("New NumThreads=%i\n", NumThreads);
+  uint32_t SimdLen = mapping::getMaxTeamThreads() / NumThreads;
   uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
   uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
-  //printf("PTeamSize=%i\n", PTeamSize);
   if (mapping::isSPMDMode()) {
     // This was moved to its own routine so it could be called directly
     // in certain situations to avoid resource consumption of unused
@@ -200,6 +199,8 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   // set, but they do not have individual ThreadStates yet. If they ever
   // modify the ICVs beyond this point a ThreadStates will be allocated.
 
+
+  // 
   bool IsActiveParallelRegion = NumThreads*mapping::getSimdLen() > 1;
   if (!IsActiveParallelRegion) {
     state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
@@ -276,9 +277,9 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
     state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                           1u, true, ident,
                                           /*ForceTeamState=*/true);
-    //state::ValueRAII SimdLengthRAII(state::SimdLength, StaticSimdLen,
-    //                                 1u, TId == 0, ident,
-    //                                 /*ForceTeamState=*/true);
+    state::ValueRAII SimdLengthRAII(state::SimdLength, SimdLen,
+                                     1u, TId == 0, ident,
+                                     /*ForceTeamState=*/true);
     state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                           (void *)nullptr, true, ident,
                                           /*ForceTeamState=*/true);
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 8c96d4cfd6d011..91b1689e0ac57e 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -207,6 +207,7 @@ void state::TeamStateTy::init(bool IsSPMD) {
   ICVState.RunSchedVar = omp_sched_static;
   ICVState.RunSchedChunkVar = 1;
   ParallelTeamSize = 1;
+  SimdLength = 1;
   HasThreadState = false;
   ParallelRegionFnVar = nullptr;
 }
@@ -214,12 +215,14 @@ void state::TeamStateTy::init(bool IsSPMD) {
 bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
   return (ICVState == Other.ICVState) &
          (HasThreadState == Other.HasThreadState) &
-         (ParallelTeamSize == Other.ParallelTeamSize);
+         (ParallelTeamSize == Other.ParallelTeamSize) &
+         (SimdLength == Other.SimdLength);
 }
 
 void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
   ICVState.assertEqual(Other.ICVState);
   ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
+  ASSERT(SimdLength == Other.SimdLength, nullptr);
   ASSERT(HasThreadState == Other.HasThreadState, nullptr);
 }
 

>From ccbde9005b0f432e87bbd360b2216d3f221810c4 Mon Sep 17 00:00:00 2001
From: Eric Francis Wright <wright117 at rzansel61.coral.llnl.gov>
Date: Fri, 4 Oct 2024 08:58:50 -0700
Subject: [PATCH 4/5] Remove extra whitespace and comments throughout

---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp       |  2 -
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp    |  2 -
 clang/lib/CodeGen/CGStmtOpenMP.cpp          |  8 +--
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp   | 58 ++++-----------------
 llvm/lib/Transforms/Utils/CodeExtractor.cpp |  8 +--
 offload/DeviceRTL/src/Kernel.cpp            |  2 -
 offload/DeviceRTL/src/Parallelism.cpp       |  1 -
 offload/DeviceRTL/src/Workshare.cpp         |  3 --
 8 files changed, 12 insertions(+), 72 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 836253ab1a7d8b..3747b00d4893ad 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1035,7 +1035,6 @@ static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
 
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
     : CGM(CGM), OMPBuilder(CGM.getModule()) {
-
   KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8);
   llvm::OpenMPIRBuilderConfig Config(
       CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
@@ -1057,7 +1056,6 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
 }
 
 void CGOpenMPRuntime::clear() {
-
   InternalVars.clear();
   // Clean non-target variable declarations possibly used only in debug info.
   for (const auto &Data : EmittedNonTargetVariables) {
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 16aff085579807..e1b2b499c9bbcb 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -262,7 +262,6 @@ class CheckVarsEscapingDeclContext final
                                bool IsCombinedParallelRegion) {
     if (!S)
       return;
-
     for (const CapturedStmt::Capture &C : S->captures()) {
       if (C.capturesVariable() && !C.capturesVariableByCopy()) {
         const ValueDecl *VD = C.getCapturedVar();
@@ -337,7 +336,6 @@ class CheckVarsEscapingDeclContext final
       return;
     if (!D->hasAssociatedStmt())
       return;
-
     if (const auto *S =
             dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
       // Do not analyze directives that do not actually require capturing,
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index b4e699c1d003b8..52812ba6ab2451 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -2771,8 +2771,6 @@ static void emitOMPSimdDirective(const OMPLoopDirective &S,
 
       auto *LoopVarRef = CL->getLoopVarRef();
       LValue LCVal = EmitLValue(LoopVarRef);
-      //Address LoopVarAddress = LCVal.getAddress(*this);
-      //LoopVar = dyn_cast<llvm::Instruction>(LoopVarAddress.getPointer());
       LoopVar = dyn_cast<llvm::Instruction>(LCVal.getPointer(*this));
       LoopVarName = LoopVarRef->getNameInfo().getAsString();
 
@@ -2786,7 +2784,6 @@ static void emitOMPSimdDirective(const OMPLoopDirective &S,
                            ->getType()
                            .getNonReferenceType();
 
-      //Address CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
       RawAddress CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
  
       emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()});
@@ -2809,8 +2806,7 @@ static void emitOMPSimdDirective(const OMPLoopDirective &S,
     };
 
     auto BodyGenCB = [&]
-                     (//InsertPointTy OuterAllocaIP,
-                      llvm::BasicBlock *OuterAllocaBB,
+                     (llvm::BasicBlock *OuterAllocaBB,
                       InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
                       InsertPointTy Prolog, InsertPointTy ReductionEpilog,
                       llvm::Value *Virtual) {
@@ -2869,7 +2865,7 @@ static void emitOMPSimdDirective(const OMPLoopDirective &S,
     ));
 
     return;
-  } 
+  }
 
   CodeGenFunction::ParentLoopDirectiveForScanRegion ScanRegion(CGF, S);
   CGF.OMPFirstScanLoop = true;
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 0798ea353c5d19..e6ac271b442ecc 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -978,11 +978,7 @@ OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
-  if (!ThreadID)
-    ThreadID = getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
-
-  Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
-                   ThreadID};
+  Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags)};
 
   // If we are in a cancellable parallel region, barriers are cancellation
   // points.
@@ -1448,7 +1444,6 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
   
   InsertPointTy DistanceIP(PrologBB, PrologBB->getTerminator()->getIterator());
   assert(DistanceCB && "expected loop trip count callback function!");
-  //Value *DistVal = DistanceCB(EntryBB, DistanceIP);
   Value *DistVal = DistanceCB(OuterAllocaBlock, DistanceIP);
   assert(DistVal && "trip count call back should return integer trip count");
   Type *DistValType = DistVal->getType();
@@ -1459,15 +1454,12 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
 
   // Create the virtual iteration variable that will be pulled into
   // the outlined function.
-  //Builder.restoreIP(OuterAllocaIP);
   Builder.SetInsertPoint(EntryBB, EntryBB->begin());
   AllocaInst *OMPIVAlloca = Builder.CreateAlloca(DistValType, nullptr, "omp.iv.tmp");
   Instruction *OMPIV = Builder.CreateLoad(DistValType, OMPIVAlloca, "omp.iv");
-  //InsertPointTy MidAllocaIP = Builder.saveIP();
 
   // Generate the privatization allocas in the block that will become the entry
   // of the outlined function.
-//  Builder.SetInsertPoint(LoopEntryBB->getTerminator());
   Builder.SetInsertPoint(LoopEntryBB, LoopEntryBB->begin());
   // Use omp.iv in the outlined region so it gets captured during the outline
   Instruction *OMPIVUse = dyn_cast<Instruction>(
@@ -1484,7 +1476,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
 
   LLVM_DEBUG(dbgs() << "Before body codegen:\n" << *OuterFn << "\n");
   assert(BodyGenCB && "Expected body generation callback!");
-  InsertPointTy CodeGenIP(LoopBodyBB, LoopBodyBB->getTerminator()->getIterator()); //LoopBodyBB->begin());
+  InsertPointTy CodeGenIP(LoopBodyBB, LoopBodyBB->getTerminator()->getIterator()); 
 
   InsertPointTy PrologIP(PrologBB, PrologBB->getTerminator()->getIterator());
   InsertPointTy ReductionEpilogIP(ReductionEpilogBB, ReductionEpilogBB->begin());
@@ -1505,20 +1497,12 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
   {
     OutlineInfo OI;
 
-    // Adjust the finalization stack, verify the adjustment, and call the
-    // finalize function a last time to finalize values between the pre-fini
-    // block and the exit block if we left the parallel "the normal way".
-    //auto FiniInfo = FinalizationStack.pop_back_val();
-    //(void)FiniInfo;
-    //assert(FiniInfo.DK == OMPD_simd && 
-    //       "Unexpected finalization stack state!");
-
     Instruction *LoopPreFiniTI = LoopPreFiniBB->getTerminator();
 
     InsertPointTy PreFiniIP(LoopPreFiniBB, LoopPreFiniTI->getIterator());
     FiniCB(PreFiniIP);
 
-    OI.OuterAllocaBB = EntryBB; //OuterAllocaBlock;
+    OI.OuterAllocaBB = EntryBB;
     OI.EntryBB = LoopEntryBB;
     OI.ExitBB = LoopExitBB;
 
@@ -1526,13 +1510,6 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
     SmallVector<BasicBlock *, 32> Blocks;
     OI.collectBlocks(ParallelRegionBlockSet, Blocks);
 
-    // Ensure a single exit node for the outlined region by creating one.
-    // We might have multiple incoming edges to the exit now due to finalizations,
-    // e.g., cancel calls that cause the control flow to leave the region.
-    //BasicBlock *PRegOutlinedExitBB = PRegExitBB;
-    //PRegExitBB = LRegExitBB;
-    //PRegOutlinedExitBB->setName("omp.loop.outlined.exit");
-
     Blocks.push_back(LoopExitBB);
 
     CodeExtractorAnalysisCache CEAC(*OuterFn);
@@ -1621,7 +1598,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
 
     LLVM_DEBUG(dbgs() << "After  privatization: " << *OuterFn << "\n");
     for (auto *BB : Blocks) {
-      LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n");
+      dbgs() << " PBR: " << BB->getName() << "\n";
     }
 
     int NumInputs = Inputs.size()-1; // One argument is always omp.iv
@@ -1672,8 +1649,8 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
     OutlineInfo OI;
 
     OI.OuterAllocaBB = OuterAllocaBlock;
-    OI.EntryBB = EntryBB; //LoopEntryBB;
-    OI.ExitBB = FinalizeBB; //LoopExitBB;
+    OI.EntryBB = EntryBB;
+    OI.ExitBB = FinalizeBB;
 
     SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
     SmallVector<BasicBlock *, 32> Blocks;
@@ -1697,12 +1674,6 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
     Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
 
     auto PrivHelper = [&](Value &V) {
-      // Exclude omp.iv from aggregate
-      //if (&V == OMPIV) {
-      //  OI.ExcludeArgsFromAggregate.push_back(&V);
-      //  return;
-      //}
-
       // Get all uses of value that are inside of the outlined region
       SetVector<Use *> Uses;
       for (Use &U : V.uses())
@@ -1810,11 +1781,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
     addOutlineInfo(std::move(OI));
   }
 
-
-
-
-
-  InsertPointTy AfterIP(FinalizeBB, FinalizeBB->end()); //UI->getParent(), UI->getParent()->end());
+  InsertPointTy AfterIP(FinalizeBB, FinalizeBB->end());
   UI->eraseFromParent();
 
   return AfterIP;
@@ -3876,6 +3843,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
     InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
     bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
+    bool IsSimdReduction,
     ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
     unsigned ReductionBufNum, Value *SrcLocInfo) {
   if (!updateToLocation(Loc))
@@ -3967,9 +3935,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
         Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
     Value *WcFuncCast =
         Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
-    Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
-    //Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
-    //    RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
+    Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
     Function *SimdReduceFn = getOrCreateRuntimeFunctionPtr(
         RuntimeFunction::OMPRTL___kmpc_nvptx_simd_reduce_nowait_v2);
     Res = Builder.CreateCall(SimdReduceFn, Args);
@@ -9066,7 +9032,6 @@ void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
       [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
           const TargetRegionEntryInfo &EntryInfo,
           const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
-
         // Generate metadata for target regions. Each entry of this metadata
         // contains:
         // - Entry 0 -> Kind of this type of metadata (0).
@@ -9491,7 +9456,6 @@ bool OffloadEntriesInfoManager::empty() const {
 
 unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
     const TargetRegionEntryInfo &EntryInfo) const {
-
   auto It = OffloadEntriesTargetRegionCount.find(
       getTargetRegionEntryCountKey(EntryInfo));
   if (It == OffloadEntriesTargetRegionCount.end())
@@ -9501,7 +9465,6 @@ unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
 
 void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
     const TargetRegionEntryInfo &EntryInfo) {
-
   OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
       EntryInfo.Count + 1;
 }
@@ -9509,7 +9472,6 @@ void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
 /// Initialize target region entry.
 void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
     const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
-
   OffloadEntriesTargetRegion[EntryInfo] =
       OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
                                    OMPTargetRegionEntryTargetRegion);
@@ -9519,7 +9481,6 @@ void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
 void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
     TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
     OMPTargetRegionEntryKind Flags) {
-
   assert(EntryInfo.Count == 0 && "expected default EntryInfo");
 
   // Update the EntryInfo with the next available count for this location.
@@ -9567,7 +9528,6 @@ bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
 
 void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
     const OffloadTargetRegionEntryInfoActTy &Action) {
-
   // Scan all target region entries and perform the provided action.
   for (const auto &It : OffloadEntriesTargetRegion) {
     Action(It.first, It.second);
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 0ad2b3a055f6c4..7d72330f53975a 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1194,7 +1194,6 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
   }
 
   StructType *StructArgTy = nullptr;
-  //AllocaInst *Struct = nullptr;
   Instruction *Struct = nullptr;
   unsigned NumAggregatedInputs = 0;
   if (AggregateArgs && !StructValues.empty()) {
@@ -1211,14 +1210,9 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
 
     if (ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) {
       auto *StructSpaceCast = new AddrSpaceCastInst(
-        Struct, PointerType ::get(Context, 0), "structArg.ascast");
+          Struct, PointerType ::get(Context, 0), "structArg.ascast");
       StructSpaceCast->insertAfter(Struct);
-      // There isn't really a point in generating this cast if you
-      // just aren't going to use it...
       Struct = StructSpaceCast;
-      //params.push_back(StructSpaceCast);
-    } else {
-      //params.push_back(Struct);
     }
     params.push_back(Struct);
     // Store aggregated inputs in the struct.
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index e6c01d1741821a..0ca6798fe128e6 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -100,9 +100,7 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
   }
 
   if (mapping::isInitialThreadInLevel0(IsSPMD))
-  {
     return -1;
-  }
 
   // Enter the generic state machine if enabled and if this thread can possibly
   // be an active worker thread.
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index 85baf580465f30..b5f60f5b2080db 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -270,7 +270,6 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   }
 
   {
-    //printf("Generic execution\n");
     // Note that the order here is important. `icv::Level` has to be updated
     // last or the other updates will cause a thread specific state to be
     // created.
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index 5086eb4966179c..c1747a70473b67 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -527,9 +527,6 @@ void SimdLoop(
   __kmpc_impl_lanemask_t SimdMask = mapping::simdmask();
   uint32_t Step = mapping::getSimdLen();
 
-  //printf("Thread=%i : Lane=%i : Len=%i : TripCount=%i\n",
-  //       mapping::getThreadIdInBlock(), mapping::getSimdLane(), mapping::getSimdLen(), TripCount);
-
   synchronize::warp(SimdMask);
   for(IType omp_iv = (IType) mapping::getSimdLane();
       omp_iv < TripCount;

>From 341512d3d74addb356d1b829216ac32fb9e8f3a0 Mon Sep 17 00:00:00 2001
From: Eric Francis Wright <wright117 at rzvernal11.llnl.gov>
Date: Fri, 18 Oct 2024 08:54:46 -0700
Subject: [PATCH 5/5] Various formatting changes

---
 clang/lib/CodeGen/CGStmtOpenMP.cpp              |  1 -
 .../include/llvm/Frontend/OpenMP/OMPIRBuilder.h |  7 +------
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp       | 17 +++++++----------
 llvm/lib/Transforms/Utils/CodeExtractor.cpp     |  7 ++++---
 offload/DeviceRTL/src/Kernel.cpp                |  2 +-
 offload/DeviceRTL/src/Parallelism.cpp           |  2 --
 offload/DeviceRTL/src/Reduction.cpp             | 13 -------------
 offload/DeviceRTL/src/State.cpp                 |  1 -
 8 files changed, 13 insertions(+), 37 deletions(-)

diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 52812ba6ab2451..0d075cd14b9b55 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1454,7 +1454,6 @@ void CodeGenFunction::EmitOMPReductionClauseInit(
     }
 
     const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(TaskRedRef)->getDecl());
-llvm::dbgs() << "Emitting " << VD->getName() << " " << VD << "\n";
     EmitVarDecl(*VD);
     EmitStoreOfScalar(ReductionDesc, GetAddrOfLocalVar(VD),
                       /*Volatile=*/false, TaskRedRef->getType());
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 82041a7b2a03fb..3f496f0ff2f80d 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -285,7 +285,7 @@ class OffloadEntriesInfoManager {
   /// Return true if a there are no entries defined.
   bool empty() const;
   /// Return number of entries defined so far.
-  unsigned size() const { return OffloadingEntriesNum /*OffloadEntriesTargetRegion.size()*/ /*OffloadingEntriesNum*/; }
+  unsigned size() const { return OffloadingEntriesNum; }
 
   OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}
 
@@ -514,11 +514,6 @@ class OpenMPIRBuilder {
   ///                              all functions are finalized.
   void finalize(Function *Fn = nullptr);
 
-  CallInst *globalizeAlloca(AllocaInst *Alloca, SmallVector<Instruction*, 32>&);
-  void globalizeParallelVars(Function *CurFn);
-  SmallPtrSet<Value*, 32> VarsNeedingGlobalization;
-  void globalizeVars(Function *CurFn);
-
   /// Add attributes known for \p FnID to \p Fn.
   void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index e6ac271b442ecc..b7fde5f0ded809 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -150,8 +150,6 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
 }
 #endif
 
-Function *GLOBAL_ReductionFunc = nullptr;
-
 static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
   if (T.isAMDGPU()) {
     StringRef Features =
@@ -808,7 +806,7 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
               "OMPIRBuilder finalization \n";
   };
 
-  if (!OffloadInfoManager.empty()) 
+  if (!OffloadInfoManager.empty())
     createOffloadEntriesAndInfoMetadata(ErrorReportFn);
 
   if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
@@ -816,7 +814,6 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
         M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
     emitUsed("llvm.compiler.used", LLVMCompilerUsed);
   }
-
 }
 
 OpenMPIRBuilder::~OpenMPIRBuilder() {
@@ -978,7 +975,9 @@ OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
-  Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags)};
+  Value *Args[] = {
+      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
+      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
 
   // If we are in a cancellable parallel region, barriers are cancellation
   // points.
@@ -2086,7 +2085,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
   LLVM_DEBUG(dbgs() << "After  privatization: " << *OuterFn << "\n");
   LLVM_DEBUG({
     for (auto *BB : Blocks)
-      LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n");
+      dbgs() << " PBR: " << BB->getName() << "\n";
   });
 
   // Adjust the finalization stack, verify the adjustment, and call the
@@ -4060,9 +4059,6 @@ OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
   if (!updateToLocation(Loc))
     return InsertPointTy();
 
-  if (ReductionInfos.size() == 0)
-    return Builder.saveIP();
-
   BasicBlock *InsertBlock = Loc.IP.getBlock();
   BasicBlock *ContinuationBlock =
       InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
@@ -4103,7 +4099,7 @@ OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
   const DataLayout &DL = Module->getDataLayout();
   unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
   Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
-  Function *ReductionFunc = getFreshReductionFunc(M);
+  Function *ReductionFunc = getFreshReductionFunc(*Module);
   Value *Lock = getOMPCriticalRegionLock(".reduction");
   Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
       IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
@@ -9369,6 +9365,7 @@ void OpenMPIRBuilder::registerTargetGlobalVariable(
     VarSize = M.getDataLayout().getPointerSize();
     Linkage = GlobalValue::WeakAnyLinkage;
   }
+
   OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
                                                       Flags, Linkage);
 }
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 7d72330f53975a..15b26a38cc28ef 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1194,7 +1194,7 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
   }
 
   StructType *StructArgTy = nullptr;
-  Instruction *Struct = nullptr;
+  AllocaInst *Struct = nullptr;
   unsigned NumAggregatedInputs = 0;
   if (AggregateArgs && !StructValues.empty()) {
     std::vector<Type *> ArgTypes;
@@ -1212,9 +1212,10 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
       auto *StructSpaceCast = new AddrSpaceCastInst(
           Struct, PointerType ::get(Context, 0), "structArg.ascast");
       StructSpaceCast->insertAfter(Struct);
-      Struct = StructSpaceCast;
+      params.push_back(StructSpaceCast);
+    } else {
+      params.push_back(Struct);
     }
-    params.push_back(Struct);
     // Store aggregated inputs in the struct.
     for (unsigned i = 0, e = StructValues.size(); i != e; ++i) {
       if (inputs.contains(StructValues[i])) {
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 0ca6798fe128e6..8bb275eae776c6 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -42,9 +42,9 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
 static void genericStateMachine(IdentTy *Ident) {
   uint32_t TId = mapping::getThreadIdInBlock();
 
-
   do {
     ParallelRegionFnTy WorkFn = nullptr;
+
     // Wait for the signal that we have a new work function.
     synchronize::threads(atomic::seq_cst);
 
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index b5f60f5b2080db..35e2a76b882773 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -199,8 +199,6 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   // set, but they do not have individual ThreadStates yet. If they ever
   // modify the ICVs beyond this point a ThreadStates will be allocated.
 
-
-  // 
   bool IsActiveParallelRegion = NumThreads*mapping::getSimdLen() > 1;
   if (!IsActiveParallelRegion) {
     state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 402c33c7779eb4..2efa322bc07e94 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -191,19 +191,6 @@ static int32_t nvptx_simd_reduce_nowait(void *reduce_data,
 
   return mapping::isSimdLeader();
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
 } // namespace
 
 extern "C" {
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 91b1689e0ac57e..aa15287fb0f1ae 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -367,7 +367,6 @@ void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
 }
 
 int omp_get_ancestor_thread_num(int Level) {
-  //return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
   return returnValIfLevelIsActive(Level, mapping::getSimdGroup(), 0);
 }
 



More information about the cfe-commits mailing list