[clang] [llvm] OpenMP offload 'simd' directive (PR #91261)

Mon May 6 12:43:18 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-clang-codegen

Author: Eric Wright (efwright)

<details>
<summary>Changes</summary>

This is a portion of the implementation for the 'simd' directive for OpenMP offload. Included in this PR is new code generation for OpenMP 'simd' loops using the OpenMPIRBuilder, and some adjustments to how threads are mapped in the runtime that is eventually needed to support the extra level of thread parallelism that 'simd' brings.

There currently is no way for the programmer to adjust the 'simdlen', so it will use a default value of 1. For now, this means that using 'teams', 'parallel' and 'simd' together should currently behave the same as before, since the inner-most level of parallelism is assigned a length of 1.

To summarize the code generation - The loop has three important parts we need to generate code for:
1. LoopProlog - where the trip count of the loop is determined and any reduction/privatized variables are initialized
2. LoopBody - This is effectively the body of the loop. Some additional code is generated to convert a given "iteration variable" into a "loop variable". This code will then be outlined so that it can be passed as a "workload" into the runtime.
3. LoopEpilog - where reductions are finalized.

There is also an additional step in the code generation that is currently not being used due to it being unneeded without the full runtime implementation. This is where the loop is being generated for 'generic' mode, i.e if the containing 'parallel' region has code that the 'simd' threads should not hit. The relationship between 'parallel' and 'simd' in generic mode is very similar to the relationship of 'teams' and 'parallel' in generic mode, though they are independent (i.e you can have 'teams-parallel' be executed in SPMD mode and 'parallel-simd' executed in generic mode.

---

Patch is 179.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/91261.diff


19 Files Affected:

- (modified) clang/lib/CodeGen/CGOpenMPRuntime.cpp (+2) 
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (+84-155) 
- (modified) clang/lib/CodeGen/CGStmtOpenMP.cpp (+133-54) 
- (modified) clang/lib/CodeGen/CodeGenFunction.cpp (+1-1) 
- (modified) clang/lib/Parse/ParseOpenMP.cpp (+2-4) 
- (modified) clang/test/OpenMP/target_teams_generic_loop_codegen.cpp (+6-6) 
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (+610-29) 
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPKinds.def (+12) 
- (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+2098-23) 
- (modified) llvm/lib/Transforms/Utils/CodeExtractor.cpp (+9-4) 
- (modified) offload/DeviceRTL/include/Interface.h (+11) 
- (modified) offload/DeviceRTL/include/Mapping.h (+7) 
- (modified) offload/DeviceRTL/src/Kernel.cpp (+3-1) 
- (modified) offload/DeviceRTL/src/Mapping.cpp (+34) 
- (modified) offload/DeviceRTL/src/Parallelism.cpp (+19-3) 
- (modified) offload/DeviceRTL/src/Reduction.cpp (+48) 
- (modified) offload/DeviceRTL/src/State.cpp (+6-1) 
- (modified) offload/DeviceRTL/src/Synchronization.cpp (+4) 
- (modified) offload/DeviceRTL/src/Workshare.cpp (+44) 


``````````diff

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index e39c7c58d2780e..b515f7761ebc71 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1033,6 +1033,7 @@ static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
 
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
     : CGM(CGM), OMPBuilder(CGM.getModule()) {
+
   KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8);
   llvm::OpenMPIRBuilderConfig Config(
       CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
@@ -1054,6 +1055,7 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
 }
 
 void CGOpenMPRuntime::clear() {
+
   InternalVars.clear();
   // Clean non-target variable declarations possibly used only in debug info.
   for (const auto &Data : EmittedNonTargetVariables) {
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 87496c8e488c67..bef637907b0ed4 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -262,6 +262,7 @@ class CheckVarsEscapingDeclContext final
                                bool IsCombinedParallelRegion) {
     if (!S)
       return;
+
     for (const CapturedStmt::Capture &C : S->captures()) {
       if (C.capturesVariable() && !C.capturesVariableByCopy()) {
         const ValueDecl *VD = C.getCapturedVar();
@@ -336,13 +337,15 @@ class CheckVarsEscapingDeclContext final
       return;
     if (!D->hasAssociatedStmt())
       return;
+
     if (const auto *S =
             dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
       // Do not analyze directives that do not actually require capturing,
       // like `omp for` or `omp simd` directives.
       llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
       getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
-      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
+      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown &&
+          D->getDirectiveKind() != OMPD_simd) {
         VisitStmt(S->getCapturedStmt());
         return;
       }
@@ -502,23 +505,23 @@ class CheckVarsEscapingDeclContext final
 } // anonymous namespace
 
 /// Get the id of the warp in the block.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
+///// We assume that the warp size is 32, which is always the case
+///// on the NVPTX device, to generate more efficient code.
 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
   CGBuilderTy &Bld = CGF.Builder;
   unsigned LaneIDBits =
-      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
-  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
+    llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
+   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+   return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
 }
-
+               
 /// Get the id of the current lane in the Warp.
 /// We assume that the warp size is 32, which is always the case
 /// on the NVPTX device, to generate more efficient code.
 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
   CGBuilderTy &Bld = CGF.Builder;
   unsigned LaneIDBits =
-      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
+    llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
   assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
   unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
@@ -2812,9 +2815,11 @@ void CGOpenMPRuntimeGPU::emitReduction(
     return;
 
   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
-#ifndef NDEBUG
+  bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
-#endif
+  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
+
+  ASTContext &C = CGM.getContext();
 
   if (Options.SimpleReduction) {
     assert(!TeamsReduction && !ParallelReduction &&
@@ -2824,155 +2829,78 @@ void CGOpenMPRuntimeGPU::emitReduction(
     return;
   }
 
-  assert((TeamsReduction || ParallelReduction) &&
-         "Invalid reduction selection in emitReduction.");
-
-  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
-  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
-  int Cnt = 0;
-  for (const Expr *DRE : Privates) {
-    PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
-    ++Cnt;
-  }
-
-  ASTContext &C = CGM.getContext();
-  const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
-      CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
-
-  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
-  // RedList, shuffle_reduce_func, interwarp_copy_func);
-  // or
-  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
+  // Source location for theident struct
   llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
 
-  llvm::Value *Res;
-  // 1. Build a list of reduction variables.
-  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
-  auto Size = RHSExprs.size();
-  for (const Expr *E : Privates) {
-    if (E->getType()->isVariablyModifiedType())
-      // Reserve place for array size.
-      ++Size;
-  }
-  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
-  QualType ReductionArrayTy = C.getConstantArrayType(
-      C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,
-      /*IndexTypeQuals=*/0);
-  Address ReductionList =
-      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
-  auto IPriv = Privates.begin();
-  unsigned Idx = 0;
-  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
-    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
-    CGF.Builder.CreateStore(
-        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-            CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
-        Elem);
-    if ((*IPriv)->getType()->isVariablyModifiedType()) {
-      // Store array size.
-      ++Idx;
-      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
-      llvm::Value *Size = CGF.Builder.CreateIntCast(
-          CGF.getVLASize(
-                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
-              .NumElts,
-          CGF.SizeTy, /*isSigned=*/false);
-      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
-                              Elem);
-    }
-  }
-
-  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-      ReductionList.emitRawPointer(CGF), CGF.VoidPtrTy);
-  llvm::Function *ReductionFn = emitReductionFunction(
-      CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
-      Privates, LHSExprs, RHSExprs, ReductionOps);
-  llvm::Value *ReductionDataSize =
-      CGF.getTypeSize(C.getRecordType(ReductionRec));
-  ReductionDataSize =
-      CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
-  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
-      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
-  llvm::Value *InterWarpCopyFn =
-      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
-
-  if (ParallelReduction) {
-    llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
-                           InterWarpCopyFn};
-
-    Res = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
-        Args);
-  } else {
-    assert(TeamsReduction && "expected teams reduction.");
-    TeamsReductions.push_back(ReductionRec);
-    auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
-        {}, "_openmp_teams_reductions_buffer_$_$ptr");
-    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
-    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
-        ReductionFn);
-    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
-    llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
-        ReductionFn);
-
-    llvm::Value *Args[] = {
-        RTLoc,
-        KernelTeamsReductionPtr,
-        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
-        ReductionDataSize,
-        RL,
-        ShuffleAndReduceFn,
-        InterWarpCopyFn,
-        GlobalToBufferCpyFn,
-        GlobalToBufferRedFn,
-        BufferToGlobalCpyFn,
-        BufferToGlobalRedFn};
-
-    Res = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
-        Args);
-  }
-
-  // 5. Build if (res == 1)
-  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
-  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
-  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
-      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
-  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
-
-  // 6. Build then branch: where we have reduced values in the master
-  //    thread in each team.
-  //    __kmpc_end_reduce{_nowait}(<gtid>);
-  //    break;
-  CGF.EmitBlock(ThenBB);
+  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+  InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
+                         CGF.AllocaInsertPt->getIterator());
+  InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
+                          CGF.Builder.GetInsertPoint());
+  llvm::OpenMPIRBuilder::LocationDescription OmpLoc(CodeGenIP);
+  llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
 
-  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
-  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
-                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
-    auto IPriv = Privates.begin();
-    auto ILHS = LHSExprs.begin();
-    auto IRHS = RHSExprs.begin();
-    for (const Expr *E : ReductionOps) {
-      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
-                                  cast<DeclRefExpr>(*IRHS));
-      ++IPriv;
-      ++ILHS;
-      ++IRHS;
+  CodeGenFunction::OMPPrivateScope Scope(CGF);
+  unsigned Idx = 0;
+  for (const Expr *Private : Privates) {
+    llvm::Type *ElementType;
+    llvm::Value *Variable;
+    llvm::Value *PrivateVariable;
+    llvm::OpenMPIRBuilder::AtomicReductionGenCB AtomicReductionGen = nullptr;
+    ElementType = CGF.ConvertTypeForMem(Private->getType());
+    const auto *RHSVar =
+        cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
+    PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).getBasePointer();
+    const auto *LHSVar =
+        cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
+    Variable = CGF.GetAddrOfLocalVar(LHSVar).getBasePointer();
+    llvm::OpenMPIRBuilder::EvaluationKindTy EvalKind;
+    switch (CGF.getEvaluationKind(Private->getType())) {
+    case TEK_Scalar:
+      EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar;
+      break;
+    case TEK_Complex:
+      EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Complex;
+      break;
+    case TEK_Aggregate:
+      EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Aggregate;
+      break;
     }
-  };
-  RegionCodeGenTy RCG(CodeGen);
-  RCG(CGF);
-  // There is no need to emit line number for unconditional branch.
-  (void)ApplyDebugLocation::CreateEmpty(CGF);
-  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
+    auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
+                            llvm::Value **LHSPtr, llvm::Value **RHSPtr,
+                            llvm::Function *NewFunc) {
+      CGF.Builder.restoreIP(CodeGenIP);
+      auto *CurFn = CGF.CurFn;
+      CGF.CurFn = NewFunc;
+
+      *LHSPtr = CGF.GetAddrOfLocalVar(
+                       cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
+                    .getBasePointer();
+      *RHSPtr = CGF.GetAddrOfLocalVar(
+                       cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
+                    .getBasePointer();
+
+      emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
+                                  cast<DeclRefExpr>(LHSExprs[I]),
+                                  cast<DeclRefExpr>(RHSExprs[I]));
+
+      CGF.CurFn = CurFn;
+
+      return InsertPointTy(CGF.Builder.GetInsertBlock(),
+                           CGF.Builder.GetInsertPoint());
+    };
+    ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
+        ElementType, Variable, PrivateVariable, EvalKind,
+        /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
+    Idx++;
+  }
+
+  CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
+      OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
+      DistributeReduction, SimdReduction, llvm::OpenMPIRBuilder::ReductionGenCBTy::Clang,
+      CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
+      RTLoc));
+  return;
 }
 
 const VarDecl *
@@ -3574,3 +3502,4 @@ llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
                                  CGM.getModule(), OMPRTL___kmpc_get_warp_size),
                              Args);
 }
+
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index ef3aa3a8e0dc61..369e91d33de208 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1402,6 +1402,7 @@ void CodeGenFunction::EmitOMPReductionClauseInit(
     }
 
     const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(TaskRedRef)->getDecl());
+llvm::dbgs() << "Emitting " << VD->getName() << " " << VD << "\n";
     EmitVarDecl(*VD);
     EmitStoreOfScalar(ReductionDesc, GetAddrOfLocalVar(VD),
                       /*Volatile=*/false, TaskRedRef->getType());
@@ -1442,7 +1443,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
     bool WithNowait = D.getSingleClause<OMPNowaitClause>() ||
                       isOpenMPParallelDirective(D.getDirectiveKind()) ||
                       TeamsLoopCanBeParallel || ReductionKind == OMPD_simd;
-    bool SimpleReduction = ReductionKind == OMPD_simd;
+    bool SimpleReduction = (CGM.getLangOpts().OpenMPIsTargetDevice ? false : ReductionKind == OMPD_simd);
     // Emit nowait reduction if nowait clause is present or directive is a
     // parallel directive (it always has implicit barrier).
     CGM.getOpenMPRuntime().emitReduction(
@@ -2673,61 +2674,139 @@ GetAlignedMapping(const OMPSimdDirective &S, CodeGenFunction &CGF) {
 }
 
 void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
-  bool UseOMPIRBuilder =
-      CGM.getLangOpts().OpenMPIRBuilder && isSupportedByOpenMPIRBuilder(S);
-  if (UseOMPIRBuilder) {
-    auto &&CodeGenIRBuilder = [this, &S, UseOMPIRBuilder](CodeGenFunction &CGF,
-                                                          PrePostActionTy &) {
-      // Use the OpenMPIRBuilder if enabled.
-      if (UseOMPIRBuilder) {
-        llvm::MapVector<llvm::Value *, llvm::Value *> AlignedVars =
-            GetAlignedMapping(S, CGF);
-        // Emit the associated statement and get its loop representation.
-        const Stmt *Inner = S.getRawStmt();
-        llvm::CanonicalLoopInfo *CLI =
-            EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
-
-        llvm::OpenMPIRBuilder &OMPBuilder =
-            CGM.getOpenMPRuntime().getOMPBuilder();
-        // Add SIMD specific metadata
-        llvm::ConstantInt *Simdlen = nullptr;
-        if (const auto *C = S.getSingleClause<OMPSimdlenClause>()) {
-          RValue Len =
-              this->EmitAnyExpr(C->getSimdlen(), AggValueSlot::ignored(),
-                                /*ignoreResult=*/true);
-          auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
-          Simdlen = Val;
-        }
-        llvm::ConstantInt *Safelen = nullptr;
-        if (const auto *C = S.getSingleClause<OMPSafelenClause>()) {
-          RValue Len =
-              this->EmitAnyExpr(C->getSafelen(), AggValueSlot::ignored(),
-                                /*ignoreResult=*/true);
-          auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal());
-          Safelen = Val;
-        }
-        llvm::omp::OrderKind Order = llvm::omp::OrderKind::OMP_ORDER_unknown;
-        if (const auto *C = S.getSingleClause<OMPOrderClause>()) {
-          if (C->getKind() == OpenMPOrderClauseKind ::OMPC_ORDER_concurrent) {
-            Order = llvm::omp::OrderKind::OMP_ORDER_concurrent;
-          }
-        }
-        // Add simd metadata to the collapsed loop. Do not generate
-        // another loop for if clause. Support for if clause is done earlier.
-        OMPBuilder.applySimd(CLI, AlignedVars,
-                             /*IfCond*/ nullptr, Order, Simdlen, Safelen);
-        return;
-      }
+  bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIsTargetDevice;
+  if(UseOMPIRBuilder) {
+    auto *CS = dyn_cast<CapturedStmt>(S.getAssociatedStmt());
+    auto *CL = dyn_cast<OMPCanonicalLoop>(CS->getCapturedStmt());
+    CGCapturedStmtInfo CGSI(*CS, CR_OpenMP);
+
+    CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(*this, &CGSI);
+    llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
+      AllocaInsertPt->getParent(), AllocaInsertPt->getIterator());
+
+    llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder();
+
+    using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+
+    // Callback function for generating the trip count of the loop.
+    // This function should assign values to the TripCount and Signed variables
+    llvm::Value *LoopVar;
+    std::string LoopVarName;
+    EmittedClosureTy LoopVarClosure;
+
+    auto DistanceCB = [&](llvm::BasicBlock *AllocaBB,
+                          InsertPointTy CodeGenIP) -> llvm::Value* {
+      InsertPointTy AllocaIP(AllocaBB, AllocaBB->getTerminator()->getIterator());
+      OMPBuilderCBHelpers::OutlinedRegionBodyRAII IRB(
+        *this, AllocaIP, *(CodeGenIP.getBlock()));
+      Builder.restoreIP(CodeGenIP);
+
+      // Emit the loop variable, needed for the distance func
+      const auto *For = dyn_cast<ForStmt>(CL->getLoopStmt());
+      if(const Stmt *InitStmt = For->getInit())
+        EmitStmt(InitStmt);
+
+      auto *LoopVarRef = CL->getLoopVarRef();
+      LValue LCVal = EmitLValue(LoopVarRef);
+      //Address LoopVarAddress = LCVal.getAddress(*this);
+      //LoopVar = dyn_cast<llvm::Instruction>(LoopVarAddress.getPointer());
+      LoopVar = dyn_cast<llvm::Instruction>(LCVal.getPointer(*this));
+      LoopVarName = LoopVarRef->getNameInfo().getAsString();
+
+      // Emit the distance func from the CanonicalLoop
+      const CapturedStmt *DistanceFunc = CL->getDistanceFunc();
+      EmittedClosureTy DistanceClosure = emitCapturedStmtFunc(*this, DistanceFunc);
+
+      // Load the output and store it in the TripCount
+      QualType LogicalTy = DistanceFunc->getCapturedDecl()
+                           ->getParam(0)
+                           ->getType()
+                           .getNonReferenceType();
+
+      //Address CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
+      RawAddress CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
+ 
+      emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()});
+      auto *TripCount = Builder.CreateLoad(CountAddr, ".count");
+
+      const CapturedStmt *LoopVarFunc = CL->getLoopVarFunc();
+      LoopVarClosure = emitCapturedStmtFunc(*this, LoopVarFunc);
+
+      return TripCount;
     };
-    {
-      auto LPCRegion =
-          CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
-      OMPLexica...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/91261