[llvm] 29a3e3d - [OpenMPOpt] Expand SPMDization with guarding for target parallel regions

Giorgis Georgakoudis via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 4 11:49:29 PDT 2021


Author: Giorgis Georgakoudis
Date: 2021-08-04T11:49:24-07:00
New Revision: 29a3e3dd7bedb5cfd50277882cfc8d4b6b07ee2f

URL: https://github.com/llvm/llvm-project/commit/29a3e3dd7bedb5cfd50277882cfc8d4b6b07ee2f
DIFF: https://github.com/llvm/llvm-project/commit/29a3e3dd7bedb5cfd50277882cfc8d4b6b07ee2f.diff

LOG: [OpenMPOpt] Expand SPMDization with guarding for target parallel regions

This patch expands SPMDization (converting generic execution mode to SPMD for target regions) by guarding code regions that should be executed only by the main thread. Specifically, it generates guarded regions, which only the main thread executes, and the synchronization with worker threads using simple barriers. For correctness, the patch aborts SPMDization for target regions if the same code executes in a parallel region, thus must be not be guarded. This check is implemented using the ParallelLevels AA.

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D106892

Added: 
    

Modified: 
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/OpenMP/custom_state_machines.ll
    llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
    llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
    llvm/test/Transforms/OpenMP/remove_globalization.ll
    llvm/test/Transforms/OpenMP/replace_globalization.ll
    llvm/test/Transforms/OpenMP/spmdization.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index da60142bce626..4e44e7abfc4d4 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -523,7 +523,7 @@ struct KernelInfoState : AbstractState {
   /// State to track if we are in SPMD-mode, assumed or know, and why we decided
   /// we cannot be. If it is assumed, then RequiresFullRuntime should also be
   /// false.
-  BooleanStateWithPtrSetVector<Instruction> SPMDCompatibilityTracker;
+  BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
 
   /// The __kmpc_target_init call in this kernel, if any. If we find more than
   /// one we abort as the kernel is malformed.
@@ -2821,6 +2821,12 @@ struct AAKernelInfoFunction : AAKernelInfo {
   AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
       : AAKernelInfo(IRP, A) {}
 
+  SmallPtrSet<Instruction *, 4> GuardedInstructions;
+
+  SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
+    return GuardedInstructions;
+  }
+
   /// See AbstractAttribute::initialize(...).
   void initialize(Attributor &A) override {
     // This is a high-level transform that might change the constant arguments
@@ -3021,6 +3027,188 @@ struct AAKernelInfoFunction : AAKernelInfo {
       return false;
     }
 
+    auto CreateGuardedRegion = [&](Instruction *RegionStartI,
+                                   Instruction *RegionEndI) {
+      LoopInfo *LI = nullptr;
+      DominatorTree *DT = nullptr;
+      MemorySSAUpdater *MSU = nullptr;
+      using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+
+      BasicBlock *ParentBB = RegionStartI->getParent();
+      Function *Fn = ParentBB->getParent();
+      Module &M = *Fn->getParent();
+
+      // Create all the blocks and logic.
+      // ParentBB:
+      //    goto RegionCheckTidBB
+      // RegionCheckTidBB:
+      //    Tid = __kmpc_hardware_thread_id()
+      //    if (Tid != 0)
+      //        goto RegionBarrierBB
+      // RegionStartBB:
+      //    <execute instructions guarded>
+      //    goto RegionEndBB
+      // RegionEndBB:
+      //    <store escaping values to shared mem>
+      //    goto RegionBarrierBB
+      //  RegionBarrierBB:
+      //    __kmpc_simple_barrier_spmd()
+      //    // second barrier is omitted if lacking escaping values.
+      //    <load escaping values from shared mem>
+      //    __kmpc_simple_barrier_spmd()
+      //    goto RegionExitBB
+      // RegionExitBB:
+      //    <execute rest of instructions>
+
+      BasicBlock *RegionEndBB = SplitBlock(ParentBB, RegionEndI->getNextNode(),
+                                           DT, LI, MSU, "region.guarded.end");
+      BasicBlock *RegionBarrierBB =
+          SplitBlock(RegionEndBB, &*RegionEndBB->getFirstInsertionPt(), DT, LI,
+                     MSU, "region.barrier");
+      BasicBlock *RegionExitBB =
+          SplitBlock(RegionBarrierBB, &*RegionBarrierBB->getFirstInsertionPt(),
+                     DT, LI, MSU, "region.exit");
+      BasicBlock *RegionStartBB =
+          SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
+
+      assert(ParentBB->getUniqueSuccessor() == RegionStartBB &&
+             "Expected a 
diff erent CFG");
+
+      BasicBlock *RegionCheckTidBB = SplitBlock(
+          ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
+
+      // Register basic blocks with the Attributor.
+      A.registerManifestAddedBasicBlock(*RegionEndBB);
+      A.registerManifestAddedBasicBlock(*RegionBarrierBB);
+      A.registerManifestAddedBasicBlock(*RegionExitBB);
+      A.registerManifestAddedBasicBlock(*RegionStartBB);
+      A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
+
+      bool HasBroadcastValues = false;
+      // Find escaping outputs from the guarded region to outside users and
+      // broadcast their values to them.
+      for (Instruction &I : *RegionStartBB) {
+        SmallPtrSet<Instruction *, 4> OutsideUsers;
+        for (User *Usr : I.users()) {
+          Instruction &UsrI = *cast<Instruction>(Usr);
+          if (UsrI.getParent() != RegionStartBB)
+            OutsideUsers.insert(&UsrI);
+        }
+
+        if (OutsideUsers.empty())
+          continue;
+
+        HasBroadcastValues = true;
+
+        // Emit a global variable in shared memory to store the broadcasted
+        // value.
+        auto *SharedMem = new GlobalVariable(
+            M, I.getType(), /* IsConstant */ false,
+            GlobalValue::InternalLinkage, UndefValue::get(I.getType()),
+            I.getName() + ".guarded.output.alloc", nullptr,
+            GlobalValue::NotThreadLocal,
+            static_cast<unsigned>(AddressSpace::Shared));
+
+        // Emit a store instruction to update the value.
+        new StoreInst(&I, SharedMem, RegionEndBB->getTerminator());
+
+        LoadInst *LoadI = new LoadInst(I.getType(), SharedMem,
+                                       I.getName() + ".guarded.output.load",
+                                       RegionBarrierBB->getTerminator());
+
+        // Emit a load instruction and replace uses of the output value.
+        for (Instruction *UsrI : OutsideUsers) {
+          assert(UsrI->getParent() == RegionExitBB &&
+                 "Expected escaping users in exit region");
+          UsrI->replaceUsesOfWith(&I, LoadI);
+        }
+      }
+
+      auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+
+      // Go to tid check BB in ParentBB.
+      const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
+      ParentBB->getTerminator()->eraseFromParent();
+      OpenMPIRBuilder::LocationDescription Loc(
+          InsertPointTy(ParentBB, ParentBB->end()), DL);
+      OMPInfoCache.OMPBuilder.updateToLocation(Loc);
+      auto *SrcLocStr = OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc);
+      Value *Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr);
+      BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
+
+      // Add check for Tid in RegionCheckTidBB
+      RegionCheckTidBB->getTerminator()->eraseFromParent();
+      OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
+          InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
+      OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
+      FunctionCallee HardwareTidFn =
+          OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+              M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
+      Value *Tid =
+          OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
+      Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
+      OMPInfoCache.OMPBuilder.Builder
+          .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
+          ->setDebugLoc(DL);
+
+      // First barrier for synchronization, ensures main thread has updated
+      // values.
+      FunctionCallee BarrierFn =
+          OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+              M, OMPRTL___kmpc_barrier_simple_spmd);
+      OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
+          RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));
+      OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid})
+          ->setDebugLoc(DL);
+
+      // Second barrier ensures workers have read broadcast values.
+      if (HasBroadcastValues)
+        CallInst::Create(BarrierFn, {Ident, Tid}, "",
+                         RegionBarrierBB->getTerminator())
+            ->setDebugLoc(DL);
+    };
+
+    SmallVector<std::pair<Instruction *, Instruction *>, 4> GuardedRegions;
+
+    for (Instruction *GuardedI : SPMDCompatibilityTracker) {
+      BasicBlock *BB = GuardedI->getParent();
+      auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
+          IRPosition::function(*GuardedI->getFunction()), nullptr,
+          DepClassTy::NONE);
+      assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");
+      auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
+      // Continue if instruction is already guarded.
+      if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
+        continue;
+
+      Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
+      for (Instruction &I : *BB) {
+        // If instruction I needs to be guarded update the guarded region
+        // bounds.
+        if (SPMDCompatibilityTracker.contains(&I)) {
+          CalleeAAFunction.getGuardedInstructions().insert(&I);
+          if (GuardedRegionStart)
+            GuardedRegionEnd = &I;
+          else
+            GuardedRegionStart = GuardedRegionEnd = &I;
+
+          continue;
+        }
+
+        // Instruction I does not need guarding, store
+        // any region found and reset bounds.
+        if (GuardedRegionStart) {
+          GuardedRegions.push_back(
+              std::make_pair(GuardedRegionStart, GuardedRegionEnd));
+          GuardedRegionStart = nullptr;
+          GuardedRegionEnd = nullptr;
+        }
+      }
+    }
+
+    for (auto &GR : GuardedRegions)
+      CreateGuardedRegion(GR.first, GR.second);
+
     // Adjust the global exec mode flag that tells the runtime what mode this
     // kernel is executed in.
     Function *Kernel = getAnchorScope();
@@ -3356,8 +3544,21 @@ struct AAKernelInfoFunction : AAKernelInfo {
         if (llvm::all_of(Objects,
                          [](const Value *Obj) { return isa<AllocaInst>(Obj); }))
           return true;
+        // Check for AAHeapToStack moved objects which must not be guarded.
+        auto &HS = A.getAAFor<AAHeapToStack>(
+            *this, IRPosition::function(*I.getFunction()),
+            DepClassTy::REQUIRED);
+        if (llvm::all_of(Objects, [&HS](const Value *Obj) {
+              auto *CB = dyn_cast<CallBase>(Obj);
+              if (!CB)
+                return false;
+              return HS.isAssumedHeapToStack(*CB);
+            })) {
+          return true;
+        }
       }
-      // For now we give up on everything but stores.
+
+      // Insert instruction that needs guarding.
       SPMDCompatibilityTracker.insert(&I);
       return true;
     };
@@ -3371,6 +3572,9 @@ struct AAKernelInfoFunction : AAKernelInfo {
     if (!IsKernelEntry) {
       updateReachingKernelEntries(A);
       updateParallelLevels(A);
+
+      if (!ParallelLevels.isValidState())
+        SPMDCompatibilityTracker.indicatePessimisticFixpoint();
     }
 
     // Callback to check a call instruction.
@@ -3521,8 +3725,10 @@ struct AAKernelInfoCallSite : AAKernelInfo {
 
         // If SPMDCompatibilityTracker is not fixed, we need to give up on the
         // idea we can run something unknown in SPMD-mode.
-        if (!SPMDCompatibilityTracker.isAtFixpoint())
+        if (!SPMDCompatibilityTracker.isAtFixpoint()) {
+          SPMDCompatibilityTracker.indicatePessimisticFixpoint();
           SPMDCompatibilityTracker.insert(&CB);
+        }
 
         // We have updated the state for this unknown call properly, there won't
         // be any change so we indicate a fixpoint.
@@ -3565,6 +3771,7 @@ struct AAKernelInfoCallSite : AAKernelInfo {
       case OMPScheduleType::DistributeChunked:
         break;
       default:
+        SPMDCompatibilityTracker.indicatePessimisticFixpoint();
         SPMDCompatibilityTracker.insert(&CB);
         break;
       };
@@ -3590,7 +3797,8 @@ struct AAKernelInfoCallSite : AAKernelInfo {
       // We do not look into tasks right now, just give up.
       SPMDCompatibilityTracker.insert(&CB);
       ReachedUnknownParallelRegions.insert(&CB);
-      break;
+      indicatePessimisticFixpoint();
+      return;
     case OMPRTL___kmpc_alloc_shared:
     case OMPRTL___kmpc_free_shared:
       // Return without setting a fixpoint, to be resolved in updateImpl.
@@ -3599,7 +3807,8 @@ struct AAKernelInfoCallSite : AAKernelInfo {
       // Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
       // generally.
       SPMDCompatibilityTracker.insert(&CB);
-      break;
+      indicatePessimisticFixpoint();
+      return;
     }
     // All other OpenMP runtime calls will not reach parallel regions so they
     // can be safely ignored for now. Since it is a known OpenMP runtime call we

diff  --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
index 6329fa6f61b74..8f57caceecf7c 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
@@ -16,12 +16,16 @@
 ;;
 ;; void no_state_machine_needed() {
 ;; #pragma omp target teams
-;;   no_parallel_region_in_here();
+;;   {
+;;     no_parallel_region_in_here();
+;;     unknown_no_openmp();
+;;   }
 ;; }
 ;;
 ;; void simple_state_machine() {
 ;; #pragma omp target teams
 ;;   {
+;;     unknown_no_openmp();
 ;; #pragma omp parallel
 ;;     { p0(); }
 ;;     no_parallel_region_in_here();
@@ -38,6 +42,7 @@
 ;; void simple_state_machine_interprocedural() {
 ;; #pragma omp target teams
 ;;   {
+;;     unknown_no_openmp();
 ;;     simple_state_machine_interprocedural_before();
 ;;     no_parallel_region_in_here();
 ;; #pragma omp parallel
@@ -75,6 +80,7 @@
 ;; void simple_state_machine_pure() {
 ;; #pragma omp target teams
 ;;   {
+;;     unknown_no_openmp();
 ;; #pragma omp parallel
 ;;     { p0(); }
 ;;     unknown_pure();
@@ -115,27 +121,22 @@ target triple = "nvptx64"
 
 %struct.ident_t = type { i32, i32, i32, i32, i8* }
 
-@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* undef
 @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
- at __omp_offloading_2c_389eb_no_state_machine_needed_l14_exec_mode = weak constant i8 1
- at __omp_offloading_2c_389eb_simple_state_machine_l19_exec_mode = weak constant i8 1
- at __omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35_exec_mode = weak constant i8 1
- at __omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50_exec_mode = weak constant i8 1
- at __omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61_exec_mode = weak constant i8 1
- at __omp_offloading_2c_389eb_simple_state_machine_pure_l72_exec_mode = weak constant i8 1
- at __omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86_exec_mode = weak constant i8 1
- at __omp_offloading_2c_389eb_no_state_machine_weak_callee_l106_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a36502b_no_state_machine_needed_l14_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a36502b_simple_state_machine_l22_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a36502b_simple_state_machine_pure_l77_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_exec_mode = weak constant i8 1
 @2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
 @G = external global i32, align 4
- at V = external global i1, align 4
 @3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
- at llvm.compiler.used = appending global [8 x i8*] [i8* @__omp_offloading_2c_389eb_no_state_machine_needed_l14_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_l19_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_pure_l72_exec_mode, i8* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86_exec_mode, i8* @__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106_exec_mode], section "llvm.metadata"
+ at llvm.compiler.used = appending global [8 x i8*] [i8* @__omp_offloading_14_a36502b_no_state_machine_needed_l14_exec_mode, i8* @__omp_offloading_14_a36502b_simple_state_machine_l22_exec_mode, i8* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_exec_mode, i8* @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_exec_mode, i8* @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_exec_mode, i8* @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_exec_mode, i8* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_exec_mode, i8* @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_exec_mode], section "llvm.metadata"
 
-; The second to last argument of __kmpc_target_init is set to false to indicate we do not need the generic runtime state machine.
-; The last argument is also set to false due to SPMDization.
-; No user code state machine is build because we do not need one.
-define weak void @__omp_offloading_2c_389eb_no_state_machine_needed_l14() #0 {
+define weak void @__omp_offloading_14_a36502b_no_state_machine_needed_l14() #0 {
 entry:
   %.zero.addr = alloca i32, align 4
   %.threadid_temp. = alloca i32, align 4
@@ -147,7 +148,7 @@ entry:
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
   store i32 %1, i32* %.threadid_temp., align 4
-  call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #2
+  call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #3
   call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
   ret void
 
@@ -155,25 +156,19 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
-; Verify we will not store a constant true here even though initially all call sites pass `i1 true` for  the second-to-last argument.
-define internal i32 @__kmpc_target_init(%struct.ident_t*, i1, i1 %use_generic_state_machine, i1) {
-  store i1 %use_generic_state_machine, i1* @V
-  %call = call i32 @unknown()
-  ret i32 %call
-}
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @no_parallel_region_in_here() #8
+  call void @no_parallel_region_in_here() #7
+  call void @unknown_no_openmp() #8
   ret void
 }
 
-; Function Attrs: convergent nounwind
 define hidden void @no_parallel_region_in_here() #1 {
 entry:
   %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2)
@@ -191,14 +186,13 @@ omp_if.end:                                       ; preds = %omp_if.then, %entry
   ret void
 }
 
-; Function Attrs: nounwind
-declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #2
+declare void @unknown_no_openmp() #2
+
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #3
 
 declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
 
-; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine.
-; A user code state machine is build because we do need one. No fallback and only one pointer comparison are needed.
-define weak void @__omp_offloading_2c_389eb_simple_state_machine_l19() #0 {
+define weak void @__omp_offloading_14_a36502b_simple_state_machine_l22() #0 {
 entry:
   %.zero.addr = alloca i32, align 4
   %.threadid_temp. = alloca i32, align 4
@@ -210,7 +204,7 @@ entry:
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
   store i32 %1, i32* %.threadid_temp., align 4
-  call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr) #2
+  call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr) #3
   call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
   ret void
 
@@ -218,7 +212,6 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
@@ -227,32 +220,30 @@ entry:
   %captured_vars_addrs1 = alloca [0 x i8*], align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  call void @unknown_no_openmp() #8
   %0 = load i32*, i32** %.global_tid..addr, align 8
   %1 = load i32, i32* %0, align 4
   %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %2, i64 0)
-  call void @no_parallel_region_in_here() #8
+  call void @no_parallel_region_in_here() #7
   %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8**
   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %3, i64 0)
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p0() #8
+  call void @p0() #7
   ret void
 }
 
-; Function Attrs: convergent
-declare void @p0() #3
+declare void @p0() #4
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -262,7 +253,7 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
@@ -270,22 +261,19 @@ declare void @__kmpc_get_shared_variables(i8***)
 
 declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p1() #8
+  call void @p1() #7
   ret void
 }
 
-; Function Attrs: convergent
-declare void @p1() #3
+declare void @p1() #4
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -295,13 +283,11 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
-; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine.
-; A user code state machine is build because we do need one. No fallback and only two pointer comparison are needed.
-define weak void @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35() #0 {
+define weak void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39() #0 {
 entry:
   %.zero.addr = alloca i32, align 4
   %.threadid_temp. = alloca i32, align 4
@@ -313,7 +299,7 @@ entry:
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
   store i32 %1, i32* %.threadid_temp., align 4
-  call void @__omp_outlined__4(i32* %.threadid_temp., i32* %.zero.addr) #2
+  call void @__omp_outlined__4(i32* %.threadid_temp., i32* %.zero.addr) #3
   call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
   ret void
 
@@ -321,7 +307,6 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__4(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
@@ -329,17 +314,17 @@ entry:
   %captured_vars_addrs = alloca [0 x i8*], align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @simple_state_machine_interprocedural_before() #8
-  call void @no_parallel_region_in_here() #8
+  call void @unknown_no_openmp() #8
+  call void @simple_state_machine_interprocedural_before() #7
+  call void @no_parallel_region_in_here() #7
   %0 = load i32*, i32** %.global_tid..addr, align 8
   %1 = load i32, i32* %0, align 4
   %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** %2, i64 0)
-  call void @simple_state_machine_interprocedural_after() #8
+  call void @simple_state_machine_interprocedural_after() #7
   ret void
 }
 
-; Function Attrs: convergent nounwind
 define hidden void @simple_state_machine_interprocedural_before() #1 {
 entry:
   %captured_vars_addrs = alloca [0 x i8*], align 8
@@ -349,19 +334,17 @@ entry:
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__5(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p1() #8
+  call void @p1() #7
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -371,11 +354,10 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__5(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__5(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
-; Function Attrs: convergent nounwind
 define hidden void @simple_state_machine_interprocedural_after() #1 {
 entry:
   %captured_vars_addrs = alloca [0 x i8*], align 8
@@ -385,9 +367,7 @@ entry:
   ret void
 }
 
-; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine.
-; A user code state machine is build because we do need one. A fallback indirect call and only two pointer comparison are needed.
-define weak void @__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50() #0 {
+define weak void @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55() #0 {
 entry:
   %.zero.addr = alloca i32, align 4
   %.threadid_temp. = alloca i32, align 4
@@ -399,7 +379,7 @@ entry:
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
   store i32 %1, i32* %.threadid_temp., align 4
-  call void @__omp_outlined__6(i32* %.threadid_temp., i32* %.zero.addr) #2
+  call void @__omp_outlined__6(i32* %.threadid_temp., i32* %.zero.addr) #3
   call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
   ret void
 
@@ -407,7 +387,6 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__6(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
@@ -420,25 +399,23 @@ entry:
   %1 = load i32, i32* %0, align 4
   %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** %2, i64 0)
-  %3 = call i32 @unknown() #8
-  %4 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8**
-  call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** %4, i64 0)
+  %call = call i32 @unknown() #7
+  %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8**
+  call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** %3, i64 0)
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__7(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p0() #8
+  call void @p0() #7
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -448,26 +425,23 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__7(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__7(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
-; Function Attrs: convergent
-declare i32 @unknown() #3
+declare i32 @unknown() #4
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__8(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p1() #8
+  call void @p1() #7
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__8_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__8_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -477,13 +451,11 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__8(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__8(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
-; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine.
-; A user code state machine is build because we do need one. No fallback and only one pointer comparison is needed.
-define weak void @__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61() #0 {
+define weak void @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66() #0 {
 entry:
   %.zero.addr = alloca i32, align 4
   %.threadid_temp. = alloca i32, align 4
@@ -495,7 +467,7 @@ entry:
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
   store i32 %1, i32* %.threadid_temp., align 4
-  call void @__omp_outlined__9(i32* %.threadid_temp., i32* %.zero.addr) #2
+  call void @__omp_outlined__9(i32* %.threadid_temp., i32* %.zero.addr) #3
   call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
   ret void
 
@@ -503,7 +475,6 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__9(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
@@ -516,25 +487,23 @@ entry:
   %1 = load i32, i32* %0, align 4
   %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** %2, i64 0)
-  call void @unknown_no_openmp() #9
+  call void @unknown_no_openmp() #8
   %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8**
   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** %3, i64 0)
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__10(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p0() #8
+  call void @p0() #7
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__10_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__10_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -544,26 +513,21 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__10(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__10(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
-; Function Attrs: convergent
-declare void @unknown_no_openmp() #5
-
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__11(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p1() #8
+  call void @p1() #7
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__11_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__11_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -573,13 +537,11 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__11(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__11(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
-; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine.
-; A user code state machine is build because we do need one. No fallback and only one pointer comparison is needed.
-define weak void @__omp_offloading_2c_389eb_simple_state_machine_pure_l72() #0 {
+define weak void @__omp_offloading_14_a36502b_simple_state_machine_pure_l77() #0 {
 entry:
   %.zero.addr = alloca i32, align 4
   %.threadid_temp. = alloca i32, align 4
@@ -591,7 +553,7 @@ entry:
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
   store i32 %1, i32* %.threadid_temp., align 4
-  call void @__omp_outlined__12(i32* %.threadid_temp., i32* %.zero.addr) #2
+  call void @__omp_outlined__12(i32* %.threadid_temp., i32* %.zero.addr) #3
   call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
   ret void
 
@@ -599,7 +561,6 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__12(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
@@ -608,29 +569,28 @@ entry:
   %captured_vars_addrs1 = alloca [0 x i8*], align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  call void @unknown_no_openmp() #8
   %0 = load i32*, i32** %.global_tid..addr, align 8
   %1 = load i32, i32* %0, align 4
   %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** %2, i64 0)
-  call void @unknown_pure() #10
+  call void @unknown_pure() #9
   %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8**
   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** %3, i64 0)
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__13(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p0() #8
+  call void @p0() #7
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__13_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__13_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -640,26 +600,23 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__13(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__13(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
-; Function Attrs: convergent nounwind readonly willreturn
-declare void @unknown_pure() #6
+declare void @unknown_pure() #5
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__14(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p1() #8
+  call void @p1() #7
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__14_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__14_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -669,13 +626,11 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__14(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__14(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
-; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine.
-; A user code state machine is build because we do need one. No fallback and no pointer comparison is needed.
-define weak void @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86() #0 {
+define weak void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92() #0 {
 entry:
   %.zero.addr = alloca i32, align 4
   %.threadid_temp. = alloca i32, align 4
@@ -687,7 +642,7 @@ entry:
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
   store i32 %1, i32* %.threadid_temp., align 4
-  call void @__omp_outlined__15(i32* %.threadid_temp., i32* %.zero.addr) #2
+  call void @__omp_outlined__15(i32* %.threadid_temp., i32* %.zero.addr) #3
   call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
   ret void
 
@@ -695,19 +650,17 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__15(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  %call = call i32 @omp_get_thread_num()
-  call void @simple_state_machine_interprocedural_nested_recursive_after(i32 %call) #8
+  %call = call i32 bitcast (i32 (...)* @omp_get_thread_num to i32 ()*)() #7
+  call void @simple_state_machine_interprocedural_nested_recursive_after(i32 %call) #7
   ret void
 }
 
-; Function Attrs: convergent nounwind
 define hidden void @simple_state_machine_interprocedural_nested_recursive_after(i32 %a) #1 {
 entry:
   %a.addr = alloca i32, align 4
@@ -722,20 +675,17 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %entry
   %1 = load i32, i32* %a.addr, align 4
   %sub = sub nsw i32 %1, 1
-  call void @simple_state_machine_interprocedural_nested_recursive_after(i32 %sub) #8
-  call void @simple_state_machine_interprocedural_nested_recursive_after_after() #8
+  call void @simple_state_machine_interprocedural_nested_recursive_after(i32 %sub) #7
+  call void @simple_state_machine_interprocedural_nested_recursive_after_after() #7
   br label %return
 
 return:                                           ; preds = %if.end, %if.then
   ret void
 }
 
-; Function Attrs: convergent
-declare i32 @omp_get_thread_num() #3
+declare i32 @omp_get_thread_num(...) #4
 
-; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine.
-; A pretty generic user code state machine is build because we do not know anything about the weak callee.
-define weak void @__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106() #0 {
+define weak void @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112() #0 {
 entry:
   %.zero.addr = alloca i32, align 4
   %.threadid_temp. = alloca i32, align 4
@@ -747,7 +697,7 @@ entry:
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
   store i32 %1, i32* %.threadid_temp., align 4
-  call void @__omp_outlined__16(i32* %.threadid_temp., i32* %.zero.addr) #2
+  call void @__omp_outlined__16(i32* %.threadid_temp., i32* %.zero.addr) #3
   call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
   ret void
 
@@ -755,45 +705,38 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__16(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @weak_callee_empty() #8
+  call void @weak_callee_empty() #7
   ret void
 }
 
-; Function Attrs: convergent nounwind
 define weak hidden void @weak_callee_empty() #1 {
 entry:
   ret void
 }
 
-; Function Attrs: convergent nounwind
-declare void @__kmpc_end_single(%struct.ident_t*, i32) #7
+declare i32 @__kmpc_single(%struct.ident_t*, i32) #6
 
-; Function Attrs: convergent nounwind
-declare i32 @__kmpc_single(%struct.ident_t*, i32) #7
+declare void @__kmpc_end_single(%struct.ident_t*, i32) #6
 
-; Function Attrs: convergent nounwind
-declare void @__kmpc_barrier(%struct.ident_t*, i32) #7
+declare void @__kmpc_barrier(%struct.ident_t*, i32) #6
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__17(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p0() #8
+  call void @p0() #7
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__17_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__17_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -803,23 +746,21 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__17(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__17(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__18(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p0() #8
+  call void @p0() #7
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__18_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__18_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -829,11 +770,10 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__18(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__18(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
-; Function Attrs: convergent nounwind
 define hidden void @simple_state_machine_interprocedural_nested_recursive_after_after() #1 {
 entry:
   %captured_vars_addrs = alloca [0 x i8*], align 8
@@ -843,19 +783,17 @@ entry:
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
 define internal void @__omp_outlined__19(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
 entry:
   %.global_tid..addr = alloca i32*, align 8
   %.bound_tid..addr = alloca i32*, align 8
   store i32* %.global_tid., i32** %.global_tid..addr, align 8
   store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
-  call void @p0() #8
+  call void @p0() #7
   ret void
 }
 
-; Function Attrs: convergent norecurse nounwind
-define internal void @__omp_outlined__19_wrapper(i16 zeroext %0, i32 %1) #4 {
+define internal void @__omp_outlined__19_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
   %.addr = alloca i16, align 2
   %.addr1 = alloca i32, align 4
@@ -865,102 +803,91 @@ entry:
   store i16 %0, i16* %.addr, align 2
   store i32 %1, i32* %.addr1, align 4
   call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__19(i32* %.addr1, i32* %.zero.addr) #2
+  call void @__omp_outlined__19(i32* %.addr1, i32* %.zero.addr) #3
   ret void
 }
 
-attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
-attributes #1 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
-attributes #2 = { nounwind }
-attributes #3 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
-attributes #4 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
-attributes #5 = { convergent "frame-pointer"="all" "llvm.assume"="omp_no_openmp" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
-attributes #6 = { convergent nounwind readonly willreturn "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
-attributes #7 = { convergent nounwind }
-attributes #8 = { convergent }
-attributes #9 = { convergent "llvm.assume"="omp_no_openmp" }
-attributes #10 = { convergent nounwind readonly willreturn }
+attributes #0 = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
+attributes #1 = { convergent noinline nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
+attributes #2 = { convergent "frame-pointer"="none" "llvm.assume"="omp_no_openmp" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
+attributes #3 = { nounwind }
+attributes #4 = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
+attributes #5 = { convergent nounwind readonly willreturn "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
+attributes #6 = { convergent nounwind }
+attributes #7 = { convergent }
+attributes #8 = { convergent "llvm.assume"="omp_no_openmp" }
+attributes #9 = { convergent nounwind readonly willreturn }
 
 !omp_offload.info = !{!0, !1, !2, !3, !4, !5, !6, !7}
 !nvvm.annotations = !{!8, !9, !10, !11, !12, !13, !14, !15}
-!llvm.module.flags = !{!16, !17, !18, !20, !21}
-!llvm.ident = !{!19}
-
-!0 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_interprocedural", i32 35, i32 2}
-!1 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_no_openmp_attr", i32 61, i32 4}
-!2 = !{i32 0, i32 44, i32 231915, !"no_state_machine_needed", i32 14, i32 0}
-!3 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_with_fallback", i32 50, i32 3}
-!4 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_pure", i32 72, i32 5}
-!5 = !{i32 0, i32 44, i32 231915, !"simple_state_machine_interprocedural_nested_recursive", i32 86, i32 6}
-!6 = !{i32 0, i32 44, i32 231915, !"no_state_machine_weak_callee", i32 106, i32 7}
-!7 = !{i32 0, i32 44, i32 231915, !"simple_state_machine", i32 19, i32 1}
-!8 = !{void ()* @__omp_offloading_2c_389eb_no_state_machine_needed_l14, !"kernel", i32 1}
-!9 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_l19, !"kernel", i32 1}
-!10 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35, !"kernel", i32 1}
-!11 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50, !"kernel", i32 1}
-!12 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61, !"kernel", i32 1}
-!13 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_pure_l72, !"kernel", i32 1}
-!14 = !{void ()* @__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86, !"kernel", i32 1}
-!15 = !{void ()* @__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106, !"kernel", i32 1}
+!llvm.module.flags = !{!16, !17, !18}
+
+!0 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2}
+!1 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4}
+!2 = !{i32 0, i32 20, i32 171331627, !"no_state_machine_needed", i32 14, i32 0}
+!3 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_with_fallback", i32 55, i32 3}
+!4 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_pure", i32 77, i32 5}
+!5 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6}
+!6 = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7}
+!7 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1}
+!8 = !{void ()* @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1}
+!9 = !{void ()* @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1}
+!10 = !{void ()* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1}
+!11 = !{void ()* @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1}
+!12 = !{void ()* @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1}
+!13 = !{void ()* @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1}
+!14 = !{void ()* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1}
+!15 = !{void ()* @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1}
 !16 = !{i32 1, !"wchar_size", i32 4}
-!17 = !{i32 7, !"PIC Level", i32 2}
-!18 = !{i32 7, !"frame-pointer", i32 2}
-!19 = !{!"clang version 13.0.0"}
-!20 = !{i32 7, !"openmp", i32 50}
-!21 = !{i32 7, !"openmp-device", i32 50}
-; CHECK: Function Attrs: convergent norecurse nounwind
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_no_state_machine_needed_l14
+!17 = !{i32 7, !"openmp", i32 50}
+!18 = !{i32 7, !"openmp-device", i32 50}
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_needed_l14
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1:[0-9]+]], i1 noundef false, i1 noundef false, i1 noundef true)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 true)
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK:       user_code.entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@__kmpc_target_init
-; CHECK-SAME: (%struct.ident_t* noalias nocapture nofree nonnull readnone align 8 dereferenceable(24) [[TMP0:%.*]], i1 [[TMP1:%.*]], i1 [[USE_GENERIC_STATE_MACHINE:%.*]], i1 [[TMP2:%.*]]) {
-; CHECK-NEXT:    store i1 false, i1* @V, align 4
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @unknown()
-; CHECK-NEXT:    ret i32 [[CALL]]
-;
-;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
 ; CHECK-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR7:[0-9]+]]
+; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR8:[0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent noinline nounwind
 ; CHECK-LABEL: define {{[^@]+}}@no_parallel_region_in_here.internalized
 ; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2:[0-9]+]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2:[0-9]+]]) #[[ATTR3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]]
 ; CHECK:       omp_if.then:
 ; CHECK-NEXT:    store i32 0, i32* @G, align 4
-; CHECK-NEXT:    call void @__kmpc_end_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__kmpc_end_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]]
 ; CHECK-NEXT:    br label [[OMP_IF_END]]
 ; CHECK:       omp_if.end:
-; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* noundef @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* noundef @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent noinline nounwind
 ; CHECK-LABEL: define {{[^@]+}}@no_parallel_region_in_here
 ; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
@@ -977,14 +904,14 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_l19
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_l22
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true)
 ; CHECK-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
 ; CHECK:       worker_state_machine.begin:
@@ -1021,16 +948,16 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK:       user_code.entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-; CHECK-NEXT:    call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1
 ; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1039,6 +966,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR8]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
@@ -1048,17 +976,17 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p0() #[[ATTR8:[0-9]+]]
+; CHECK-NEXT:    call void @p0() #[[ATTR9:[0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1070,21 +998,21 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p1() #[[ATTR8]]
+; CHECK-NEXT:    call void @p1() #[[ATTR9]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1096,18 +1024,18 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true)
 ; CHECK-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
 ; CHECK:       worker_state_machine.begin:
@@ -1150,16 +1078,16 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK:       user_code.entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-; CHECK-NEXT:    call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__4
 ; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1167,6 +1095,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR8]]
 ; CHECK-NEXT:    call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR7]]
 ; CHECK-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR7]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
@@ -1176,18 +1105,18 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent noinline nounwind
 ; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before.internalized
 ; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent noinline nounwind
 ; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before
 ; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
@@ -1198,17 +1127,17 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p1() #[[ATTR8]]
+; CHECK-NEXT:    call void @p1() #[[ATTR9]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1220,22 +1149,22 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent noinline nounwind
 ; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after.internalized
 ; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent noinline nounwind
 ; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after
 ; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
@@ -1246,14 +1175,14 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true)
 ; CHECK-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
 ; CHECK:       worker_state_machine.begin:
@@ -1292,16 +1221,16 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK:       user_code.entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-; CHECK-NEXT:    call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__6
 ; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1313,23 +1242,23 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @unknown() #[[ATTR8]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @unknown() #[[ATTR9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1341,21 +1270,21 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p1() #[[ATTR8]]
+; CHECK-NEXT:    call void @p1() #[[ATTR9]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1367,18 +1296,18 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true)
 ; CHECK-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
 ; CHECK:       worker_state_machine.begin:
@@ -1415,16 +1344,16 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK:       user_code.entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-; CHECK-NEXT:    call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__9
 ; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1436,23 +1365,23 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
-; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR9:[0-9]+]]
+; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR8]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
 ; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__10
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__10_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1464,21 +1393,21 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__11
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p1() #[[ATTR8]]
+; CHECK-NEXT:    call void @p1() #[[ATTR9]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__11_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1490,30 +1419,64 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_pure_l72
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_pure_l77
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef true, i1 noundef false, i1 noundef false)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true)
+; CHECK-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
+; CHECK-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
+; CHECK:       worker_state_machine.begin:
+; CHECK-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
+; CHECK-NEXT:    [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
+; CHECK-NEXT:    [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
+; CHECK-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
+; CHECK-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; CHECK:       worker_state_machine.finished:
+; CHECK-NEXT:    ret void
+; CHECK:       worker_state_machine.is_active.check:
+; CHECK-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
+; CHECK:       worker_state_machine.parallel_region.check:
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__13_wrapper.ID to void (i16, i32)*)
+; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
+; CHECK:       worker_state_machine.parallel_region.execute:
+; CHECK-NEXT:    call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]])
+; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
+; CHECK:       worker_state_machine.parallel_region.check1:
+; CHECK-NEXT:    br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]]
+; CHECK:       worker_state_machine.parallel_region.execute2:
+; CHECK-NEXT:    call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]])
+; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
+; CHECK:       worker_state_machine.parallel_region.check3:
+; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
+; CHECK:       worker_state_machine.parallel_region.end:
+; CHECK-NEXT:    call void @__kmpc_kernel_end_parallel()
+; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
+; CHECK:       worker_state_machine.done.barrier:
+; CHECK-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
+; CHECK:       thread.user_code.check:
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK:       user_code.entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-; CHECK-NEXT:    call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
-; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false)
+; CHECK-NEXT:    call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__12
 ; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1522,25 +1485,26 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR8]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef @__omp_outlined__13_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef @__omp_outlined__14_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__13
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__13_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1552,21 +1516,21 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__14
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p1() #[[ATTR8]]
+; CHECK-NEXT:    call void @p1() #[[ATTR9]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__14_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1578,18 +1542,18 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true)
 ; CHECK-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
 ; CHECK:       worker_state_machine.begin:
@@ -1604,11 +1568,13 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK:       worker_state_machine.is_active.check:
 ; CHECK-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.check:
-; CHECK-NEXT:    br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__19_wrapper
+; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.execute:
 ; CHECK-NEXT:    call void @__omp_outlined__19_wrapper(i16 0, i32 [[TMP0]])
 ; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
-; CHECK:       worker_state_machine.parallel_region.check1:
+; CHECK:       worker_state_machine.parallel_region.fallback.execute:
+; CHECK-NEXT:    call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
 ; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
 ; CHECK:       worker_state_machine.parallel_region.end:
 ; CHECK-NEXT:    call void @__kmpc_kernel_end_parallel()
@@ -1620,26 +1586,26 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK:       user_code.entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
-; CHECK-NEXT:    call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__15
 ; CHECK-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR2]]
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 bitcast (i32 (...)* @omp_get_thread_num to i32 ()*)() #[[ATTR9]]
 ; CHECK-NEXT:    call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR7]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent noinline nounwind
 ; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after.internalized
 ; CHECK-SAME: (i32 [[A:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
@@ -1658,7 +1624,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent noinline nounwind
 ; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after
 ; CHECK-SAME: (i32 [[A:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
@@ -1672,21 +1638,21 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1
-; CHECK-NEXT:    call void @simple_state_machine_interprocedural_nested_recursive_after(i32 [[SUB]]) #[[ATTR8]]
-; CHECK-NEXT:    call void @simple_state_machine_interprocedural_nested_recursive_after_after() #[[ATTR8]]
+; CHECK-NEXT:    call void @simple_state_machine_interprocedural_nested_recursive_after(i32 [[SUB]]) #[[ATTR9]]
+; CHECK-NEXT:    call void @simple_state_machine_interprocedural_nested_recursive_after_after() #[[ATTR9]]
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true)
 ; CHECK-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
 ; CHECK:       worker_state_machine.begin:
@@ -1713,15 +1679,15 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK:       user_code.entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
-; CHECK-NEXT:    call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__16
 ; CHECK-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1731,24 +1697,24 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent noinline nounwind
 ; CHECK-LABEL: define {{[^@]+}}@weak_callee_empty
 ; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__17
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__17_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1760,21 +1726,21 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__18
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__18_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1786,22 +1752,22 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent noinline nounwind
 ; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after.internalized
 ; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent noinline nounwind
 ; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after
 ; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
@@ -1812,17 +1778,17 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__19
 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: convergent norecurse nounwind
+; CHECK: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__19_wrapper
 ; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
@@ -1834,63 +1800,57 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_no_state_machine_needed_l14
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_needed_l14
 ; CHECK-DISABLED-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-DISABLED-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1:[0-9]+]], i1 noundef false, i1 noundef true, i1 noundef true)
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true)
 ; CHECK-DISABLED-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-DISABLED-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK-DISABLED:       user_code.entry:
-; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2:[0-9]+]]
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-DISABLED-NEXT:    ret void
 ; CHECK-DISABLED:       worker.exit:
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__kmpc_target_init
-; CHECK-DISABLED-SAME: (%struct.ident_t* noalias nocapture nofree nonnull readnone align 8 dereferenceable(24) [[TMP0:%.*]], i1 [[TMP1:%.*]], i1 [[USE_GENERIC_STATE_MACHINE:%.*]], i1 [[TMP2:%.*]]) {
-; CHECK-DISABLED-NEXT:    store i1 false, i1* @V, align 4
-; CHECK-DISABLED-NEXT:    [[CALL:%.*]] = call i32 @unknown()
-; CHECK-DISABLED-NEXT:    ret i32 [[CALL]]
-;
-;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR7:[0-9]+]]
+; CHECK-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR8:[0-9]+]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@no_parallel_region_in_here.internalized
 ; CHECK-DISABLED-SAME: () #[[ATTR1:[0-9]+]] {
 ; CHECK-DISABLED-NEXT:  entry:
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2:[0-9]+]]) #[[ATTR2]]
-; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2:[0-9]+]]) #[[ATTR3]]
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
 ; CHECK-DISABLED-NEXT:    br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]]
 ; CHECK-DISABLED:       omp_if.then:
 ; CHECK-DISABLED-NEXT:    store i32 0, i32* @G, align 4
-; CHECK-DISABLED-NEXT:    call void @__kmpc_end_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__kmpc_end_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    br label [[OMP_IF_END]]
 ; CHECK-DISABLED:       omp_if.end:
-; CHECK-DISABLED-NEXT:    call void @__kmpc_barrier(%struct.ident_t* noundef @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__kmpc_barrier(%struct.ident_t* noundef @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@no_parallel_region_in_here
 ; CHECK-DISABLED-SAME: () #[[ATTR1]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -1907,26 +1867,26 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_l19
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_l22
 ; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-DISABLED-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef true, i1 noundef true)
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
 ; CHECK-DISABLED-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-DISABLED-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK-DISABLED:       user_code.entry:
-; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-DISABLED-NEXT:    ret void
 ; CHECK-DISABLED:       worker.exit:
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -1935,6 +1895,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; CHECK-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; CHECK-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR8]]
 ; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
@@ -1944,17 +1905,17 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR8:[0-9]+]]
+; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR9:[0-9]+]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -1966,21 +1927,21 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p1() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @p1() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -1992,30 +1953,30 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_l35
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39
 ; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-DISABLED-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef true, i1 noundef true)
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
 ; CHECK-DISABLED-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-DISABLED-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK-DISABLED:       user_code.entry:
-; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-DISABLED-NEXT:    ret void
 ; CHECK-DISABLED:       worker.exit:
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2023,6 +1984,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; CHECK-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR8]]
 ; CHECK-DISABLED-NEXT:    call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR7]]
 ; CHECK-DISABLED-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR7]]
 ; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
@@ -2032,18 +1994,18 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before.internalized
 ; CHECK-DISABLED-SAME: () #[[ATTR1]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before
 ; CHECK-DISABLED-SAME: () #[[ATTR1]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2054,17 +2016,17 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p1() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @p1() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2076,22 +2038,22 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after.internalized
 ; CHECK-DISABLED-SAME: () #[[ATTR1]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after
 ; CHECK-DISABLED-SAME: () #[[ATTR1]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2102,26 +2064,26 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_with_fallback_l50
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55
 ; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-DISABLED-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef true, i1 noundef true)
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
 ; CHECK-DISABLED-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-DISABLED-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK-DISABLED:       user_code.entry:
-; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-DISABLED-NEXT:    ret void
 ; CHECK-DISABLED:       worker.exit:
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2133,23 +2095,23 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
-; CHECK-DISABLED-NEXT:    [[TMP2:%.*]] = call i32 @unknown() #[[ATTR8]]
-; CHECK-DISABLED-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0)
+; CHECK-DISABLED-NEXT:    [[CALL:%.*]] = call i32 @unknown() #[[ATTR9]]
+; CHECK-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2161,21 +2123,21 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p1() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @p1() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2187,30 +2149,30 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66
 ; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-DISABLED-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef true, i1 noundef true)
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
 ; CHECK-DISABLED-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-DISABLED-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK-DISABLED:       user_code.entry:
-; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-DISABLED-NEXT:    ret void
 ; CHECK-DISABLED:       worker.exit:
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2222,23 +2184,23 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
-; CHECK-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR9:[0-9]+]]
+; CHECK-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR8]]
 ; CHECK-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__10
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__10_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2250,21 +2212,21 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__11
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p1() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @p1() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__11_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2276,30 +2238,30 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_pure_l72
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_pure_l77
 ; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-DISABLED-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef true, i1 noundef false, i1 noundef false)
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
 ; CHECK-DISABLED-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-DISABLED-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK-DISABLED:       user_code.entry:
-; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
-; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false)
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
+; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-DISABLED-NEXT:    ret void
 ; CHECK-DISABLED:       worker.exit:
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__12
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2308,6 +2270,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; CHECK-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; CHECK-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR8]]
 ; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
@@ -2316,17 +2279,17 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__13
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__13_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2338,21 +2301,21 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__14
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p1() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @p1() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__14_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2364,40 +2327,40 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_interprocedural_nested_recursive_l86
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92
 ; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-DISABLED-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef true, i1 noundef true)
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
 ; CHECK-DISABLED-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-DISABLED-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK-DISABLED:       user_code.entry:
-; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-DISABLED-NEXT:    ret void
 ; CHECK-DISABLED:       worker.exit:
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__15
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[CALL:%.*]] = call i32 bitcast (i32 (...)* @omp_get_thread_num to i32 ()*)() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR7]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after.internalized
 ; CHECK-DISABLED-SAME: (i32 [[A:%.*]]) #[[ATTR1]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2416,7 +2379,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after
 ; CHECK-DISABLED-SAME: (i32 [[A:%.*]]) #[[ATTR1]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2430,32 +2393,32 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED:       if.end:
 ; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4
 ; CHECK-DISABLED-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1
-; CHECK-DISABLED-NEXT:    call void @simple_state_machine_interprocedural_nested_recursive_after(i32 [[SUB]]) #[[ATTR8]]
-; CHECK-DISABLED-NEXT:    call void @simple_state_machine_interprocedural_nested_recursive_after_after() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @simple_state_machine_interprocedural_nested_recursive_after(i32 [[SUB]]) #[[ATTR9]]
+; CHECK-DISABLED-NEXT:    call void @simple_state_machine_interprocedural_nested_recursive_after_after() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    br label [[RETURN]]
 ; CHECK-DISABLED:       return:
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_no_state_machine_weak_callee_l106
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112
 ; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-DISABLED-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef true, i1 noundef true)
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
 ; CHECK-DISABLED-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-DISABLED-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK-DISABLED:       user_code.entry:
-; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
 ; CHECK-DISABLED-NEXT:    ret void
 ; CHECK-DISABLED:       worker.exit:
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__16
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2465,24 +2428,24 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@weak_callee_empty
 ; CHECK-DISABLED-SAME: () #[[ATTR1]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__17
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__17_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2494,21 +2457,21 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__18
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__18_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2520,22 +2483,22 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after.internalized
 ; CHECK-DISABLED-SAME: () #[[ATTR1]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after
 ; CHECK-DISABLED-SAME: () #[[ATTR1]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2546,17 +2509,17 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__19
 ; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR8]]
+; CHECK-DISABLED-NEXT:    call void @p0() #[[ATTR9]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 ;
-; CHECK-DISABLED: Function Attrs: convergent norecurse nounwind
+; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__19_wrapper
 ; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
@@ -2568,6 +2531,6 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
 ; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-; CHECK-DISABLED-NEXT:    call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
index 628e75fd1b4fe..251885dc79c72 100644
--- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -8,15 +8,20 @@ target triple = "nvptx64"
 
 @G = external global i32
 ;.
+; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
 ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
+; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
 ;.
 define weak void @kernel0() #0 {
-; CHECK-LABEL: define {{[^@]+}}@kernel0()
-; CHECK: #[[ATTR0:[0-9]+]] {
+; CHECK-LABEL: define {{[^@]+}}@kernel0
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
-; CHECK-NEXT:    call void @helper0()
-; CHECK-NEXT:    call void @helper1()
-; CHECK-NEXT:    call void @helper2()
+; CHECK-NEXT:    call void @helper0() #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    call void @helper1() #[[ATTR1]]
+; CHECK-NEXT:    call void @helper2() #[[ATTR1]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
 ; CHECK-NEXT:    ret void
 ;
@@ -31,10 +36,10 @@ define weak void @kernel0() #0 {
 @kernel1_exec_mode = weak constant i8 1
 
 define weak void @kernel1() #0 {
-; CHECK-LABEL: define {{[^@]+}}@kernel1()
-; CHECK: #[[ATTR0]] {
+; CHECK-LABEL: define {{[^@]+}}@kernel1
+; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
-; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @helper1() #[[ATTR1]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
 ; CHECK-NEXT:    ret void
 ;
@@ -47,13 +52,13 @@ define weak void @kernel1() #0 {
 @kernel2_exec_mode = weak constant i8 1
 
 define weak void @kernel2() #0 {
-; CHECK-LABEL: define {{[^@]+}}@kernel2()
-; CHECK: #[[ATTR0]] {
-; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
-; CHECK-NEXT:    call void @helper0()
-; CHECK-NEXT:    call void @helper1()
-; CHECK-NEXT:    call void @helper2()
-; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+; CHECK-LABEL: define {{[^@]+}}@kernel2
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper0() #[[ATTR1]]
+; CHECK-NEXT:    call void @helper1() #[[ATTR1]]
+; CHECK-NEXT:    call void @helper2() #[[ATTR1]]
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
@@ -65,8 +70,22 @@ define weak void @kernel2() #0 {
 }
 
 define internal void @helper0() {
-; CHECK-LABEL: define {{[^@]+}}@helper0() {{#[0-9]+}} {
+; CHECK-LABEL: define {{[^@]+}}@helper0
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:    br label [[REGION_CHECK_TID:%.*]]
+; CHECK:       region.check.tid:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
+; CHECK:       region.guarded:
 ; CHECK-NEXT:    store i32 666, i32* @G, align 4
+; CHECK-NEXT:    br label [[REGION_GUARDED_END:%.*]]
+; CHECK:       region.guarded.end:
+; CHECK-NEXT:    br label [[REGION_BARRIER]]
+; CHECK:       region.barrier:
+; CHECK-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]])
+; CHECK-NEXT:    br label [[REGION_EXIT:%.*]]
+; CHECK:       region.exit:
 ; CHECK-NEXT:    ret void
 ;
   %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
@@ -75,7 +94,8 @@ define internal void @helper0() {
 }
 
 define internal void @helper1() {
-; CHECK-LABEL: define {{[^@]+}}@helper1() {{#[0-9]+}} {
+; CHECK-LABEL: define {{[^@]+}}@helper1
+; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:    br label [[F:%.*]]
 ; CHECK:       t:
 ; CHECK-NEXT:    unreachable
@@ -93,8 +113,22 @@ f:
 }
 
 define internal void @helper2() {
-; CHECK-LABEL: define {{[^@]+}}@helper2() {{#[0-9]+}} {
-; CHECK-NEXT:    store i32 666, i32* @G
+; CHECK-LABEL: define {{[^@]+}}@helper2
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:    br label [[REGION_CHECK_TID:%.*]]
+; CHECK:       region.check.tid:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
+; CHECK:       region.guarded:
+; CHECK-NEXT:    store i32 666, i32* @G, align 4
+; CHECK-NEXT:    br label [[REGION_GUARDED_END:%.*]]
+; CHECK:       region.guarded.end:
+; CHECK-NEXT:    br label [[REGION_BARRIER]]
+; CHECK:       region.barrier:
+; CHECK-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]])
+; CHECK-NEXT:    br label [[REGION_EXIT:%.*]]
+; CHECK:       region.exit:
 ; CHECK-NEXT:    ret void
 ;
   %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
@@ -117,9 +151,12 @@ attributes #0 = { "omp_target_thread_limit"="666" "omp_target_num_teams"="777"}
 !2 = !{void ()* @kernel0, !"kernel", i32 1}
 !3 = !{void ()* @kernel1, !"kernel", i32 1}
 !4 = !{void ()* @kernel2, !"kernel", i32 1}
+;
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "omp_target_num_teams"="777" "omp_target_thread_limit"="666" }
-;
+; CHECK: attributes #[[ATTR1]] = { nounwind }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind }
+;.
 ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
 ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK: [[META2:![0-9]+]] = !{void ()* @kernel0, !"kernel", i32 1}

diff  --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
index 9144cd32ef4f7..a4bd36b3e98f6 100644
--- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
+++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
@@ -128,7 +128,8 @@ define internal void @is_generic_helper2() {
 ; CHECK-NEXT:    call void @foo()
 ; CHECK-NEXT:    ret void
 ; CHECK:       f:
-; CHECK-NEXT:    unreachable
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    ret void
 ;
   %isSPMD = call i8 @__kmpc_is_spmd_exec_mode()
   %c = icmp eq i8 %isSPMD, 0

diff  --git a/llvm/test/Transforms/OpenMP/remove_globalization.ll b/llvm/test/Transforms/OpenMP/remove_globalization.ll
index 08fb75d6c78e5..74a2cab0be1c6 100644
--- a/llvm/test/Transforms/OpenMP/remove_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll
@@ -23,6 +23,7 @@ define void @kernel() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i1 false, i1 false, i1 true)
 ; CHECK-NEXT:    call void @foo() #[[ATTR0:[0-9]+]]
 ; CHECK-NEXT:    call void @bar() #[[ATTR0]]
+; CHECK-NEXT:    call void @unknown_no_openmp()
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i1 false, i1 true)
 ; CHECK-NEXT:    ret void
 ;
@@ -31,6 +32,7 @@ define void @kernel() {
 ; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i1 false, i1 false, i1 true)
 ; CHECK-DISABLED-NEXT:    call void @foo() #[[ATTR0:[0-9]+]]
 ; CHECK-DISABLED-NEXT:    call void @bar() #[[ATTR0]]
+; CHECK-DISABLED-NEXT:    call void @unknown_no_openmp()
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i1 false, i1 true)
 ; CHECK-DISABLED-NEXT:    ret void
 ;
@@ -38,6 +40,7 @@ entry:
   %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i1 false, i1 true, i1 true)
   call void @foo()
   call void @bar()
+  call void @unknown_no_openmp()
   call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i1 false, i1 true)
   ret void
 }
@@ -67,7 +70,7 @@ define internal void @bar() {
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 noundef 4) #[[ATTR0]], !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT:    call void @share(i8* nofree writeonly [[TMP0]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    call void @share(i8* nofree writeonly [[TMP0]]) #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT:    call void @__kmpc_free_shared(i8* [[TMP0]], i64 noundef 4) #[[ATTR0]]
 ; CHECK-NEXT:    ret void
 ;
@@ -75,7 +78,7 @@ define internal void @bar() {
 ; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 noundef 4) #[[ATTR0]], !dbg [[DBG8:![0-9]+]]
-; CHECK-DISABLED-NEXT:    call void @share(i8* nofree writeonly [[TMP0]]) #[[ATTR3:[0-9]+]]
+; CHECK-DISABLED-NEXT:    call void @share(i8* nofree writeonly [[TMP0]]) #[[ATTR4:[0-9]+]]
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_free_shared(i8* [[TMP0]], i64 noundef 4) #[[ATTR0]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
@@ -146,6 +149,8 @@ declare i8* @__kmpc_alloc_shared(i64)
 ; CHECK: declare void @__kmpc_free_shared(i8* nocapture, i64)
 declare void @__kmpc_free_shared(i8*, i64)
 
+declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
+
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !6, !7}
 !nvvm.annotations = !{!5}

diff  --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
index c26ae61e38c2f..7c1b71e3d041e 100644
--- a/llvm/test/Transforms/OpenMP/replace_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs
 ; RUN: opt -S -passes='openmp-opt' < %s | FileCheck %s
 ; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
@@ -13,15 +14,11 @@ target triple = "nvptx64"
 ; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
 ; CHECK-REMARKS-NOT: 6 bytes
 
-; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef
-; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef
-
-; CHECK: %{{.*}} = call i8* @__kmpc_alloc_shared({{.*}})
-; CHECK: call void @__kmpc_free_shared({{.*}})
 define dso_local void @foo() {
 entry:
   %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
   %x = call i8* @__kmpc_alloc_shared(i64 4)
+  call void @unknown_no_openmp()
   %x_on_stack = bitcast i8* %x to i32*
   %0 = bitcast i32* %x_on_stack to i8*
   call void @use(i8* %0)
@@ -32,6 +29,7 @@ entry:
 
 define void @bar() {
   %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
+  call void @unknown_no_openmp()
   call void @baz()
   call void @qux()
   call void @negative_qux_spmd()
@@ -39,10 +37,10 @@ define void @bar() {
   ret void
 }
 
-; CHECK: call void @use.internalized(i8* nofree writeonly addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* [[SHARED_X]], i32 0, i32 0) to i8*))
 define internal void @baz() {
 entry:
   %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 true)
+  call void @unknown_no_openmp()
   %cmp = icmp eq i32 %call, -1
   br i1 %cmp, label %master, label %exit
 master:
@@ -56,10 +54,10 @@ exit:
   ret void
 }
 
-; CHECK: call void @use.internalized(i8* nofree writeonly addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* [[SHARED_Y]], i32 0, i32 0) to i8*))
 define internal void @qux() {
 entry:
   %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true)
+  call void @unknown_no_openmp()
   %0 = icmp eq i32 %call, -1
   br i1 %0, label %master, label %exit
 master:
@@ -76,6 +74,7 @@ exit:
 define internal void @negative_qux_spmd() {
 entry:
   %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 true, i1 true, i1 true)
+  call void @unknown_no_openmp()
   %0 = icmp eq i32 %call, -1
   br i1 %0, label %master, label %exit
 master:
@@ -110,6 +109,8 @@ declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
 
 declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
 
+declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
+
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5, !6}
 !nvvm.annotations = !{!7, !8}
@@ -127,3 +128,105 @@ declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
 !10 = !DISubroutineType(types: !2)
 !11 = !DILocation(line: 5, column: 7, scope: !9)
 !12 = !DILocation(line: 5, column: 14, scope: !9)
+;.
+; CHECK: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external local_unnamed_addr global i8*
+; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [113 x i8] c"
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
+; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [16 x i8] undef, align 32
+; CHECK: @[[Y:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32
+;.
+; CHECK-LABEL: define {{[^@]+}}@foo() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true)
+; CHECK-NEXT:    [[X:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    call void @unknown_no_openmp()
+; CHECK-NEXT:    call void @use.internalized(i8* nofree writeonly [[X]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    call void @__kmpc_free_shared(i8* [[X]], i64 4) #[[ATTR1]]
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bar() {
+; CHECK-NEXT:    [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
+; CHECK-NEXT:    call void @unknown_no_openmp()
+; CHECK-NEXT:    call void @baz()
+; CHECK-NEXT:    call void @qux()
+; CHECK-NEXT:    call void @negative_qux_spmd()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@baz() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noundef nonnull @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true)
+; CHECK-NEXT:    call void @unknown_no_openmp()
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], -1
+; CHECK-NEXT:    br i1 [[CMP]], label [[MASTER:%.*]], label [[EXIT:%.*]]
+; CHECK:       master:
+; CHECK-NEXT:    call void @use.internalized(i8* nofree writeonly addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*)) #[[ATTR4]]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@qux() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noundef nonnull @[[GLOB1]], i1 noundef false, i1 noundef true, i1 noundef true)
+; CHECK-NEXT:    call void @unknown_no_openmp()
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[CALL]], -1
+; CHECK-NEXT:    br i1 [[TMP0]], label [[MASTER:%.*]], label [[EXIT:%.*]]
+; CHECK:       master:
+; CHECK-NEXT:    call void @use.internalized(i8* nofree writeonly addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @y, i32 0, i32 0) to i8*)) #[[ATTR4]]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@negative_qux_spmd() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noundef nonnull @[[GLOB1]], i1 noundef true, i1 noundef true, i1 noundef true)
+; CHECK-NEXT:    call void @unknown_no_openmp()
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[CALL]], -1
+; CHECK-NEXT:    br i1 [[TMP0]], label [[MASTER:%.*]], label [[EXIT:%.*]]
+; CHECK:       master:
+; CHECK-NEXT:    [[Y:%.*]] = call i8* @__kmpc_alloc_shared(i64 noundef 24) #[[ATTR1]], !dbg [[DBG9:![0-9]+]]
+; CHECK-NEXT:    call void @use.internalized(i8* nofree writeonly [[Y]]) #[[ATTR4]]
+; CHECK-NEXT:    call void @__kmpc_free_shared(i8* [[Y]], i64 noundef 24) #[[ATTR1]]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@use.internalized
+; CHECK-SAME: (i8* nofree writeonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i8* [[X]], i8** @S, align 8
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@use
+; CHECK-SAME: (i8* [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i8* [[X]], i8** @S, align 8
+; CHECK-NEXT:    ret void
+;
+;.
+; CHECK: attributes #[[ATTR0]] = { nofree nosync nounwind willreturn writeonly }
+; CHECK: attributes #[[ATTR1]] = { nounwind }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind readnone }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="omp_no_openmp" }
+; CHECK: attributes #[[ATTR4]] = { nounwind writeonly }
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+; CHECK: [[META1:![0-9]+]] = !DIFile(filename: "replace_globalization.c", directory: "/tmp/replace_globalization.c")
+; CHECK: [[META2:![0-9]+]] = !{}
+; CHECK: [[META3:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK: [[META7:![0-9]+]] = !{void ()* @foo, !"kernel", i32 1}
+; CHECK: [[META8:![0-9]+]] = !{void ()* @bar, !"kernel", i32 1}
+; CHECK: [[DBG9]] = !DILocation(line: 5, column: 14, scope: !10)
+; CHECK: [[META10:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+; CHECK: [[META11:![0-9]+]] = !DISubroutineType(types: !2)
+;.

diff  --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
index a34245cf87514..448a3f3138d45 100644
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -49,6 +49,30 @@
 ;;     spmd_amenable();
 ;;   }
 ;; }
+;;
+;; void sequential_loop_to_shared_var_guarded() {
+;;   #pragma omp target teams
+;;   {
+;;     int x = 42;
+;;     for (int i = 0; i < 100; ++i) {
+;;       #pragma omp parallel
+;;       {
+;;         x++;
+;;         unknown();
+;;       }
+;;     }
+;;     spmd_amenable();
+;;   }
+;; }
+;;
+;; void do_not_spmdize_target() {
+;; #pragma omp target teams
+;;     {
+;;         // Incompatible parallel level, called both
+;;         // from parallel and target regions
+;;         unknown();
+;;     }
+;; }
 
 target triple = "nvptx64"
 
@@ -56,33 +80,42 @@ target triple = "nvptx64"
 
 @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
- at __omp_offloading_14_5896c35_sequential_loop_l5_exec_mode = weak constant i8 1
- at __omp_offloading_14_5896c35_sequential_loop_to_stack_var_l20_exec_mode = weak constant i8 1
- at __omp_offloading_14_5896c35_sequential_loop_to_shared_var_l35_exec_mode = weak constant i8 1
- at llvm.compiler.used = appending global [3 x i8*] [i8* @__omp_offloading_14_5896c35_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_5896c35_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_5896c35_sequential_loop_to_shared_var_l35_exec_mode], section "llvm.metadata"
+ at __omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode = weak constant i8 1
+ at __omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode = weak constant i8 1
+ at llvm.compiler.used = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata"
 
 ;.
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
 ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; CHECK: @[[__OMP_OFFLOADING_14_5896C35_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; CHECK: @[[__OMP_OFFLOADING_14_5896C35_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; CHECK: @[[__OMP_OFFLOADING_14_5896C35_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [3 x i8*] [i8* @__omp_offloading_14_5896c35_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_5896c35_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_5896c35_sequential_loop_to_shared_var_l35_exec_mode], section "llvm.metadata"
+; CHECK: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata"
 ; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32
+; CHECK: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32
 ;.
 ; CHECK-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
 ; CHECK-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_5896C35_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
-; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_5896C35_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
-; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_5896C35_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
-; CHECK-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [3 x i8*] [i8* @__omp_offloading_14_5896c35_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_5896c35_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_5896c35_sequential_loop_to_shared_var_l35_exec_mode], section "llvm.metadata"
+; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata"
 ; CHECK-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32
+; CHECK-DISABLED: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32
 ; CHECK-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
 ; CHECK-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
 ; CHECK-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
+; CHECK-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
 ;.
-define weak void @__omp_offloading_14_5896c35_sequential_loop_l5() #0 {
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_5896c35_sequential_loop_l5
+define weak void @__omp_offloading_14_a34ca11_sequential_loop_l5() #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
@@ -99,7 +132,7 @@ define weak void @__omp_offloading_14_5896c35_sequential_loop_l5() #0 {
 ; CHECK:       worker.exit:
 ; CHECK-NEXT:    ret void
 ;
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_5896c35_sequential_loop_l5
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5
 ; CHECK-DISABLED-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
@@ -191,9 +224,9 @@ define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP3]], 1
 ; CHECK-NEXT:    store i32 [[INC]], i32* [[I]], align 4
-; CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    call void @spmd_amenable() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    call void @spmd_amenable() #[[ATTR5:[0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__
@@ -219,7 +252,7 @@ define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias
 ; CHECK-DISABLED-NEXT:    [[TMP3:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-DISABLED-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP3]], 1
 ; CHECK-DISABLED-NEXT:    store i32 [[INC]], i32* [[I]], align 4
-; CHECK-DISABLED-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-DISABLED-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK-DISABLED:       for.end:
 ; CHECK-DISABLED-NEXT:    call void @spmd_amenable() #[[ATTR5:[0-9]+]]
 ; CHECK-DISABLED-NEXT:    ret void
@@ -250,7 +283,7 @@ for.inc:                                          ; preds = %for.body
   %4 = load i32, i32* %i, align 4
   %inc = add nsw i32 %4, 1
   store i32 %inc, i32* %i, align 4
-  br label %for.cond, !llvm.loop !9
+  br label %for.cond, !llvm.loop !13
 
 for.end:                                          ; preds = %for.cond
   call void @spmd_amenable() #4
@@ -263,7 +296,7 @@ define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @unknown() #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    call void @unknown() #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1
@@ -337,8 +370,8 @@ declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #3
 
 declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
 
-define weak void @__omp_offloading_14_5896c35_sequential_loop_to_stack_var_l20() #0 {
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_5896c35_sequential_loop_to_stack_var_l20
+define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20() #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
@@ -355,7 +388,7 @@ define weak void @__omp_offloading_14_5896c35_sequential_loop_to_stack_var_l20()
 ; CHECK:       worker.exit:
 ; CHECK-NEXT:    ret void
 ;
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_5896c35_sequential_loop_to_stack_var_l20
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20
 ; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
@@ -432,7 +465,7 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 1
 ; CHECK-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
-; CHECK-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR4]]
+; CHECK-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR5]]
 ; CHECK-NEXT:    store i32 0, i32* [[I]], align 4
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
@@ -448,9 +481,9 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
 ; CHECK-NEXT:    store i32 [[INC]], i32* [[I]], align 4
-; CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    call void @spmd_amenable() #[[ATTR4]]
+; CHECK-NEXT:    call void @spmd_amenable() #[[ATTR5]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2
@@ -479,7 +512,7 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; CHECK-DISABLED-NEXT:    [[TMP4:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-DISABLED-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
 ; CHECK-DISABLED-NEXT:    store i32 [[INC]], i32* [[I]], align 4
-; CHECK-DISABLED-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-DISABLED-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK-DISABLED:       for.end:
 ; CHECK-DISABLED-NEXT:    call void @spmd_amenable() #[[ATTR5]]
 ; CHECK-DISABLED-NEXT:    ret void
@@ -513,7 +546,7 @@ for.inc:                                          ; preds = %for.body
   %4 = load i32, i32* %i, align 4
   %inc = add nsw i32 %4, 1
   store i32 %inc, i32* %i, align 4
-  br label %for.cond, !llvm.loop !11
+  br label %for.cond, !llvm.loop !15
 
 for.end:                                          ; preds = %for.cond
   call void @spmd_amenable() #4
@@ -531,7 +564,7 @@ define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    call void @unknown() #[[ATTR5]]
+; CHECK-NEXT:    call void @unknown() #[[ATTR6]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3
@@ -595,8 +628,8 @@ entry:
 
 declare void @__kmpc_free_shared(i8* nocapture, i64) #3
 
-define weak void @__omp_offloading_14_5896c35_sequential_loop_to_shared_var_l35() #0 {
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_5896c35_sequential_loop_to_shared_var_l35
+define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35() #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
@@ -613,7 +646,7 @@ define weak void @__omp_offloading_14_5896c35_sequential_loop_to_shared_var_l35(
 ; CHECK:       worker.exit:
 ; CHECK-NEXT:    ret void
 ;
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_5896c35_sequential_loop_to_shared_var_l35
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35
 ; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
@@ -705,9 +738,9 @@ define internal void @__omp_outlined__4(i32* noalias %.global_tid., i32* noalias
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
 ; CHECK-NEXT:    store i32 [[INC]], i32* [[I]], align 4
-; CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    call void @spmd_amenable() #[[ATTR4]]
+; CHECK-NEXT:    call void @spmd_amenable() #[[ATTR5]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4
@@ -735,7 +768,7 @@ define internal void @__omp_outlined__4(i32* noalias %.global_tid., i32* noalias
 ; CHECK-DISABLED-NEXT:    [[TMP4:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-DISABLED-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
 ; CHECK-DISABLED-NEXT:    store i32 [[INC]], i32* [[I]], align 4
-; CHECK-DISABLED-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-DISABLED-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK-DISABLED:       for.end:
 ; CHECK-DISABLED-NEXT:    call void @spmd_amenable() #[[ATTR5]]
 ; CHECK-DISABLED-NEXT:    ret void
@@ -771,7 +804,7 @@ for.inc:                                          ; preds = %for.body
   %6 = load i32, i32* %i, align 4
   %inc = add nsw i32 %6, 1
   store i32 %inc, i32* %i, align 4
-  br label %for.cond, !llvm.loop !12
+  br label %for.cond, !llvm.loop !16
 
 for.end:                                          ; preds = %for.cond
   call void @spmd_amenable() #4
@@ -790,7 +823,7 @@ define internal void @__omp_outlined__5(i32* noalias %.global_tid., i32* noalias
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[X]], align 4
 ; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
 ; CHECK-NEXT:    store i32 [[INC]], i32* [[X]], align 4
-; CHECK-NEXT:    call void @unknown() #[[ATTR5]]
+; CHECK-NEXT:    call void @unknown() #[[ATTR6]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5
@@ -875,6 +908,430 @@ entry:
   ret void
 }
 
+define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50() #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false)
+; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+; CHECK:       user_code.entry:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+; CHECK-NEXT:    call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false)
+; CHECK-NEXT:    ret void
+; CHECK:       worker.exit:
+; CHECK-NEXT:    ret void
+;
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50
+; CHECK-DISABLED-SAME: () #[[ATTR0]] {
+; CHECK-DISABLED-NEXT:  entry:
+; CHECK-DISABLED-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
+; CHECK-DISABLED-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+; CHECK-DISABLED-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true)
+; CHECK-DISABLED-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
+; CHECK-DISABLED-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
+; CHECK-DISABLED:       worker_state_machine.begin:
+; CHECK-DISABLED-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-DISABLED-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
+; CHECK-DISABLED-NEXT:    [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
+; CHECK-DISABLED-NEXT:    [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
+; CHECK-DISABLED-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
+; CHECK-DISABLED-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; CHECK-DISABLED:       worker_state_machine.finished:
+; CHECK-DISABLED-NEXT:    ret void
+; CHECK-DISABLED:       worker_state_machine.is_active.check:
+; CHECK-DISABLED-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
+; CHECK-DISABLED:       worker_state_machine.parallel_region.check:
+; CHECK-DISABLED-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*)
+; CHECK-DISABLED-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
+; CHECK-DISABLED:       worker_state_machine.parallel_region.execute:
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
+; CHECK-DISABLED-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
+; CHECK-DISABLED:       worker_state_machine.parallel_region.fallback.execute:
+; CHECK-DISABLED-NEXT:    call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
+; CHECK-DISABLED-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
+; CHECK-DISABLED:       worker_state_machine.parallel_region.end:
+; CHECK-DISABLED-NEXT:    call void @__kmpc_kernel_end_parallel()
+; CHECK-DISABLED-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
+; CHECK-DISABLED:       worker_state_machine.done.barrier:
+; CHECK-DISABLED-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-DISABLED-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
+; CHECK-DISABLED:       thread.user_code.check:
+; CHECK-DISABLED-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; CHECK-DISABLED-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+; CHECK-DISABLED:       user_code.entry:
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
+; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
+; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
+; CHECK-DISABLED-NEXT:    ret void
+; CHECK-DISABLED:       worker.exit:
+; CHECK-DISABLED-NEXT:    ret void
+;
+entry:
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
+  %exec_user_code = icmp eq i32 %0, -1
+  br i1 %exec_user_code, label %user_code.entry, label %worker.exit
+
+user_code.entry:                                  ; preds = %entry
+  %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %1, i32* %.threadid_temp., align 4
+  call void @__omp_outlined__6(i32* %.threadid_temp., i32* %.zero.addr) #3
+  call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
+  ret void
+
+worker.exit:                                      ; preds = %entry
+  ret void
+}
+
+define internal void @__omp_outlined__6(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__6
+; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
+; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*) to i32*
+; CHECK-NEXT:    br label [[REGION_CHECK_TID:%.*]]
+; CHECK:       region.check.tid:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
+; CHECK:       region.guarded:
+; CHECK-NEXT:    store i32 42, i32* [[X_ON_STACK]], align 4
+; CHECK-NEXT:    br label [[REGION_GUARDED_END:%.*]]
+; CHECK:       region.guarded.end:
+; CHECK-NEXT:    br label [[REGION_BARRIER]]
+; CHECK:       region.barrier:
+; CHECK-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-NEXT:    br label [[REGION_EXIT:%.*]]
+; CHECK:       region.exit:
+; CHECK-NEXT:    store i32 0, i32* [[I]], align 4
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[I]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP2]], 100
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+; CHECK-NEXT:    store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP4]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP5]], i64 noundef 1)
+; CHECK-NEXT:    br label [[FOR_INC:%.*]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[I]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP6]], 1
+; CHECK-NEXT:    store i32 [[INC]], i32* [[I]], align 4
+; CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    call void @spmd_amenable() #[[ATTR5]]
+; CHECK-NEXT:    ret void
+;
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6
+; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+; CHECK-DISABLED-NEXT:  entry:
+; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-DISABLED-NEXT:    [[I:%.*]] = alloca i32, align 4
+; CHECK-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
+; CHECK-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-DISABLED-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*) to i32*
+; CHECK-DISABLED-NEXT:    store i32 42, i32* [[X_ON_STACK]], align 4
+; CHECK-DISABLED-NEXT:    store i32 0, i32* [[I]], align 4
+; CHECK-DISABLED-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK-DISABLED:       for.cond:
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[I]], align 4
+; CHECK-DISABLED-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100
+; CHECK-DISABLED-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+; CHECK-DISABLED:       for.body:
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+; CHECK-DISABLED-NEXT:    store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8
+; CHECK-DISABLED-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-DISABLED-NEXT:    [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 1)
+; CHECK-DISABLED-NEXT:    br label [[FOR_INC:%.*]]
+; CHECK-DISABLED:       for.inc:
+; CHECK-DISABLED-NEXT:    [[TMP4:%.*]] = load i32, i32* [[I]], align 4
+; CHECK-DISABLED-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+; CHECK-DISABLED-NEXT:    store i32 [[INC]], i32* [[I]], align 4
+; CHECK-DISABLED-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-DISABLED:       for.end:
+; CHECK-DISABLED-NEXT:    call void @spmd_amenable() #[[ATTR5]]
+; CHECK-DISABLED-NEXT:    ret void
+;
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  %captured_vars_addrs = alloca [1 x i8*], align 8
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  %x = call i8* @__kmpc_alloc_shared(i64 4)
+  %x_on_stack = bitcast i8* %x to i32*
+  store i32 42, i32* %x_on_stack, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %captured_vars_addrs, i64 0, i64 0
+  %2 = bitcast i32* %x_on_stack to i8*
+  store i8* %2, i8** %1, align 8
+  %3 = load i32*, i32** %.global_tid..addr, align 8
+  %4 = load i32, i32* %3, align 4
+  %5 = bitcast [1 x i8*]* %captured_vars_addrs to i8**
+  call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %4, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** %5, i64 1)
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %6 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %6, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond, !llvm.loop !17
+
+for.end:                                          ; preds = %for.cond
+  call void @spmd_amenable() #4
+  call void @__kmpc_free_shared(i8* %x, i64 4)
+  ret void
+}
+
+define internal void @__omp_outlined__7(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %x) #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[X_ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    store i32* [[X]], i32** [[X_ADDR]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[X]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
+; CHECK-NEXT:    store i32 [[INC]], i32* [[X]], align 4
+; CHECK-NEXT:    call void @unknown() #[[ATTR6]]
+; CHECK-NEXT:    ret void
+;
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7
+; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
+; CHECK-DISABLED-NEXT:  entry:
+; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-DISABLED-NEXT:    [[X_ADDR:%.*]] = alloca i32*, align 8
+; CHECK-DISABLED-NEXT:    store i32* [[X]], i32** [[X_ADDR]], align 8
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[X]], align 4
+; CHECK-DISABLED-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
+; CHECK-DISABLED-NEXT:    store i32 [[INC]], i32* [[X]], align 4
+; CHECK-DISABLED-NEXT:    call void @unknown() #[[ATTR6]]
+; CHECK-DISABLED-NEXT:    ret void
+;
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %x.addr = alloca i32*, align 8
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  store i32* %x, i32** %x.addr, align 8
+  %0 = load i32*, i32** %x.addr, align 8
+  %1 = load i32, i32* %0, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* %0, align 4
+  call void @unknown() #5
+  ret void
+}
+
+define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
+; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+; CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+; CHECK-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+; CHECK-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
+; CHECK-NEXT:    call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
+; CHECK-NEXT:    ret void
+;
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
+; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-DISABLED-NEXT:  entry:
+; CHECK-DISABLED-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
+; CHECK-DISABLED-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+; CHECK-DISABLED-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+; CHECK-DISABLED-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+; CHECK-DISABLED-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+; CHECK-DISABLED-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+; CHECK-DISABLED-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+; CHECK-DISABLED-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+; CHECK-DISABLED-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
+; CHECK-DISABLED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
+; CHECK-DISABLED-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
+; CHECK-DISABLED-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
+; CHECK-DISABLED-NEXT:    ret void
+;
+entry:
+  %.addr = alloca i16, align 2
+  %.addr1 = alloca i32, align 4
+  %.zero.addr = alloca i32, align 4
+  %global_args = alloca i8**, align 8
+  store i32 0, i32* %.zero.addr, align 4
+  store i16 %0, i16* %.addr, align 2
+  store i32 %1, i32* %.addr1, align 4
+  call void @__kmpc_get_shared_variables(i8*** %global_args)
+  %2 = load i8**, i8*** %global_args, align 8
+  %3 = getelementptr inbounds i8*, i8** %2, i64 0
+  %4 = bitcast i8** %3 to i32**
+  %5 = load i32*, i32** %4, align 8
+  call void @__omp_outlined__7(i32* %.addr1, i32* %.zero.addr, i32* %5) #3
+  ret void
+}
+
+define weak void @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65() #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
+; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true)
+; CHECK-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
+; CHECK-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
+; CHECK:       worker_state_machine.begin:
+; CHECK-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
+; CHECK-NEXT:    [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
+; CHECK-NEXT:    [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
+; CHECK-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
+; CHECK-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; CHECK:       worker_state_machine.finished:
+; CHECK-NEXT:    ret void
+; CHECK:       worker_state_machine.is_active.check:
+; CHECK-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
+; CHECK:       worker_state_machine.parallel_region.fallback.execute:
+; CHECK-NEXT:    call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
+; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
+; CHECK:       worker_state_machine.parallel_region.end:
+; CHECK-NEXT:    call void @__kmpc_kernel_end_parallel()
+; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
+; CHECK:       worker_state_machine.done.barrier:
+; CHECK-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
+; CHECK:       thread.user_code.check:
+; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+; CHECK:       user_code.entry:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
+; CHECK-NEXT:    ret void
+; CHECK:       worker.exit:
+; CHECK-NEXT:    ret void
+;
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65
+; CHECK-DISABLED-SAME: () #[[ATTR0]] {
+; CHECK-DISABLED-NEXT:  entry:
+; CHECK-DISABLED-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
+; CHECK-DISABLED-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+; CHECK-DISABLED-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true)
+; CHECK-DISABLED-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
+; CHECK-DISABLED-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
+; CHECK-DISABLED:       worker_state_machine.begin:
+; CHECK-DISABLED-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-DISABLED-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
+; CHECK-DISABLED-NEXT:    [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
+; CHECK-DISABLED-NEXT:    [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
+; CHECK-DISABLED-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
+; CHECK-DISABLED-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; CHECK-DISABLED:       worker_state_machine.finished:
+; CHECK-DISABLED-NEXT:    ret void
+; CHECK-DISABLED:       worker_state_machine.is_active.check:
+; CHECK-DISABLED-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
+; CHECK-DISABLED:       worker_state_machine.parallel_region.fallback.execute:
+; CHECK-DISABLED-NEXT:    call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
+; CHECK-DISABLED-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
+; CHECK-DISABLED:       worker_state_machine.parallel_region.end:
+; CHECK-DISABLED-NEXT:    call void @__kmpc_kernel_end_parallel()
+; CHECK-DISABLED-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
+; CHECK-DISABLED:       worker_state_machine.done.barrier:
+; CHECK-DISABLED-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-DISABLED-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
+; CHECK-DISABLED:       thread.user_code.check:
+; CHECK-DISABLED-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; CHECK-DISABLED-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+; CHECK-DISABLED:       user_code.entry:
+; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
+; CHECK-DISABLED-NEXT:    call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
+; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
+; CHECK-DISABLED-NEXT:    ret void
+; CHECK-DISABLED:       worker.exit:
+; CHECK-DISABLED-NEXT:    ret void
+;
+entry:
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
+  %exec_user_code = icmp eq i32 %0, -1
+  br i1 %exec_user_code, label %user_code.entry, label %worker.exit
+
+user_code.entry:                                  ; preds = %entry
+  %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %1, i32* %.threadid_temp., align 4
+  call void @__omp_outlined__8(i32* %.threadid_temp., i32* %.zero.addr) #3
+  call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
+  ret void
+
+worker.exit:                                      ; preds = %entry
+  ret void
+}
+
+define internal void @__omp_outlined__8(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8
+; CHECK-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    call void @unknown() #[[ATTR6]]
+; CHECK-NEXT:    ret void
+;
+; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8
+; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+; CHECK-DISABLED-NEXT:  entry:
+; CHECK-DISABLED-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-DISABLED-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-DISABLED-NEXT:    call void @unknown() #[[ATTR6]]
+; CHECK-DISABLED-NEXT:    ret void
+;
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  call void @unknown() #5
+  ret void
+}
+
 attributes #0 = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
 attributes #1 = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
 attributes #2 = { convergent "frame-pointer"="none" "llvm.assume"="ompx_spmd_amenable" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
@@ -882,30 +1339,36 @@ attributes #3 = { nounwind }
 attributes #4 = { convergent "llvm.assume"="ompx_spmd_amenable" }
 attributes #5 = { convergent }
 
-!omp_offload.info = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5}
-!llvm.module.flags = !{!6, !7, !8}
-
-!0 = !{i32 0, i32 20, i32 92892213, !"sequential_loop_to_stack_var", i32 20, i32 1}
-!1 = !{i32 0, i32 20, i32 92892213, !"sequential_loop", i32 5, i32 0}
-!2 = !{i32 0, i32 20, i32 92892213, !"sequential_loop_to_shared_var", i32 35, i32 2}
-!3 = !{void ()* @__omp_offloading_14_5896c35_sequential_loop_l5, !"kernel", i32 1}
-!4 = !{void ()* @__omp_offloading_14_5896c35_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
-!5 = !{void ()* @__omp_offloading_14_5896c35_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
-!6 = !{i32 1, !"wchar_size", i32 4}
-!7 = !{i32 7, !"openmp", i32 50}
-!8 = !{i32 7, !"openmp-device", i32 50}
-!9 = distinct !{!9, !10}
-!10 = !{!"llvm.loop.mustprogress"}
-!11 = distinct !{!11, !10}
-!12 = distinct !{!12, !10}
+!omp_offload.info = !{!0, !1, !2, !3, !4}
+!nvvm.annotations = !{!5, !6, !7, !8, !9}
+!llvm.module.flags = !{!10, !11, !12}
+
+!0 = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1}
+!1 = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0}
+!2 = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2}
+!3 = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4}
+!4 = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
+!5 = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1}
+!6 = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
+!7 = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
+!8 = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
+!9 = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1}
+!10 = !{i32 1, !"wchar_size", i32 4}
+!11 = !{i32 7, !"openmp", i32 50}
+!12 = !{i32 7, !"openmp-device", i32 50}
+!13 = distinct !{!13, !14}
+!14 = !{!"llvm.loop.mustprogress"}
+!15 = distinct !{!15, !14}
+!16 = distinct !{!16, !14}
+!17 = distinct !{!17, !14}
 ;.
 ; CHECK: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
 ; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
 ; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent "frame-pointer"="none" "llvm.assume"="ompx_spmd_amenable" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
 ; CHECK: attributes #[[ATTR3]] = { nounwind }
-; CHECK: attributes #[[ATTR4]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
-; CHECK: attributes #[[ATTR5]] = { convergent }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nounwind }
+; CHECK: attributes #[[ATTR5]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
+; CHECK: attributes #[[ATTR6]] = { convergent }
 ;.
 ; CHECK-DISABLED: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
 ; CHECK-DISABLED: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" }
@@ -915,31 +1378,41 @@ attributes #5 = { convergent }
 ; CHECK-DISABLED: attributes #[[ATTR5]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
 ; CHECK-DISABLED: attributes #[[ATTR6]] = { convergent }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 92892213, !"sequential_loop_to_stack_var", i32 20, i32 1}
-; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 92892213, !"sequential_loop", i32 5, i32 0}
-; CHECK: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 92892213, !"sequential_loop_to_shared_var", i32 35, i32 2}
-; CHECK: [[META3:![0-9]+]] = !{void ()* @__omp_offloading_14_5896c35_sequential_loop_l5, !"kernel", i32 1}
-; CHECK: [[META4:![0-9]+]] = !{void ()* @__omp_offloading_14_5896c35_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
-; CHECK: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_5896c35_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
-; CHECK: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-; CHECK: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50}
-; CHECK: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
-; CHECK: [[LOOP9]] = distinct !{!9, !10}
-; CHECK: [[META10:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; CHECK: [[LOOP11]] = distinct !{!11, !10}
-; CHECK: [[LOOP12]] = distinct !{!12, !10}
+; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0}
+; CHECK: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2}
+; CHECK: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4}
+; CHECK: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
+; CHECK: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1}
+; CHECK: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
+; CHECK: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
+; CHECK: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
+; CHECK: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1}
+; CHECK: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; CHECK: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CHECK: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK: [[LOOP13]] = distinct !{!13, !14}
+; CHECK: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[LOOP15]] = distinct !{!15, !14}
+; CHECK: [[LOOP16]] = distinct !{!16, !14}
+; CHECK: [[LOOP17]] = distinct !{!17, !14}
 ;.
-; CHECK-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 92892213, !"sequential_loop_to_stack_var", i32 20, i32 1}
-; CHECK-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 92892213, !"sequential_loop", i32 5, i32 0}
-; CHECK-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 92892213, !"sequential_loop_to_shared_var", i32 35, i32 2}
-; CHECK-DISABLED: [[META3:![0-9]+]] = !{void ()* @__omp_offloading_14_5896c35_sequential_loop_l5, !"kernel", i32 1}
-; CHECK-DISABLED: [[META4:![0-9]+]] = !{void ()* @__omp_offloading_14_5896c35_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
-; CHECK-DISABLED: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_5896c35_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
-; CHECK-DISABLED: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-; CHECK-DISABLED: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50}
-; CHECK-DISABLED: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
-; CHECK-DISABLED: [[LOOP9]] = distinct !{!9, !10}
-; CHECK-DISABLED: [[META10:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; CHECK-DISABLED: [[LOOP11]] = distinct !{!11, !10}
-; CHECK-DISABLED: [[LOOP12]] = distinct !{!12, !10}
+; CHECK-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1}
+; CHECK-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0}
+; CHECK-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2}
+; CHECK-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4}
+; CHECK-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
+; CHECK-DISABLED: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1}
+; CHECK-DISABLED: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
+; CHECK-DISABLED: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
+; CHECK-DISABLED: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
+; CHECK-DISABLED: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1}
+; CHECK-DISABLED: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; CHECK-DISABLED: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CHECK-DISABLED: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK-DISABLED: [[LOOP13]] = distinct !{!13, !14}
+; CHECK-DISABLED: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"}
+; CHECK-DISABLED: [[LOOP15]] = distinct !{!15, !14}
+; CHECK-DISABLED: [[LOOP16]] = distinct !{!16, !14}
+; CHECK-DISABLED: [[LOOP17]] = distinct !{!17, !14}
 ;.


        


More information about the llvm-commits mailing list